From 6b185f0c719c8e6f6f5ffc2c3408ec4b3a686886 Mon Sep 17 00:00:00 2001 From: andrewresearch Date: Sun, 29 Oct 2017 19:15:01 +1000 Subject: [PATCH 1/2] factorie-nlp-api-1 Basic config to support high level api --- .gitignore | 1 + build.sbt | 21 +- .../cc/factorie/app/chain/Lexicons.scala | 7 +- .../cc/factorie/app/nlp/BasicSection.scala | 8 + .../scala/cc/factorie/app/nlp/Document.scala | 159 +- .../app/nlp/DocumentAnnotationPipeline.scala | 36 + .../factorie/app/nlp/DocumentAnnotator.scala | 52 +- .../app/nlp/DocumentAnnotatorPipeline.scala | 96 +- .../cc/factorie/app/nlp/DocumentName.scala | 6 + .../factorie/app/nlp/DocumentSubstring.scala | 16 + .../app/nlp/MutableDocumentAnnotatorMap.scala | 7 + .../scala/cc/factorie/app/nlp/Section.scala | 54 +- .../scala/cc/factorie/app/nlp/Sentence.scala | 61 +- .../scala/cc/factorie/app/nlp/Token.scala | 166 +- .../scala/cc/factorie/app/nlp/TokenSpan.scala | 103 +- .../cc/factorie/app/nlp/TokenSpanBuffer.scala | 6 + .../app/nlp/TokenSpanCollection.scala | 5 + .../cc/factorie/app/nlp/TokenSpanList.scala | 6 + .../cc/factorie/app/nlp/TokenString.scala | 9 + .../app/nlp/UnknownDocumentAnnotator.scala | 9 + .../app/nlp/coref/AbstractEntity.scala | 12 + .../app/nlp/coref/AbstractMention.scala | 13 + .../nlp/coref/AcronymNounPhraseFinder.scala | 26 + .../app/nlp/coref/BaseCorefModel.scala | 10 + .../app/nlp/coref/ConjunctionOptions.scala | 8 + .../app/nlp/coref/CorefConllOutput.scala | 83 - .../app/nlp/coref/CorefFeatures.scala | 158 +- .../factorie/app/nlp/coref/CorefModel.scala | 47 + .../factorie/app/nlp/coref/CorefOptions.scala | 7 - .../factorie/app/nlp/coref/CorefSystem.scala | 135 + .../factorie/app/nlp/coref/CorefTrainer.scala | 347 --- .../app/nlp/coref/CorefTrainerOpts.scala | 13 + .../app/nlp/coref/DefaultHashMap.scala | 7 + .../app/nlp/coref/DeterministicCoref.scala | 2435 ----------------- .../nlp/coref/DeterministicNamedCoref.scala | 52 - .../factorie/app/nlp/coref/ForwardCoref.scala | 419 --- .../app/nlp/coref/ForwardCorefBase.scala | 176 ++ .../nlp/coref/ForwardCorefTrainerOpts.scala | 30 + .../ImplicitFeatureConjunctionTensor.scala | 132 - .../cc/factorie/app/nlp/coref/Mention.scala | 268 +- .../app/nlp/coref/MentionAlignment.scala | 171 -- .../app/nlp/coref/MentionBuffer.scala | 6 + .../nlp/coref/MentionCharacteristics.scala | 68 + .../app/nlp/coref/MentionCollection.scala | 5 + .../factorie/app/nlp/coref/MentionList.scala | 7 + .../app/nlp/coref/MentionPairFeatures.scala | 221 ++ .../app/nlp/coref/MentionPairLabel.scala | 299 +- .../app/nlp/coref/MentionPhraseFinder.scala | 120 +- .../app/nlp/coref/NerForwardCoref.scala | 21 + .../app/nlp/coref/NerPhraseFinder.scala | 11 + .../cc/factorie/app/nlp/coref/Node.scala | 14 + .../app/nlp/coref/PairwiseCorefModel.scala | 227 +- .../app/nlp/coref/ParseForwardCoref.scala | 20 + .../app/nlp/coref/PronounFinder.scala | 17 + .../factorie/app/nlp/coref/PronounSets.scala | 77 + .../app/nlp/coref/StructuredCoreference.scala | 177 -- .../factorie/app/nlp/coref/TokenFreqs.scala | 44 + .../app/nlp/coref/TopTokenFrequencies.scala | 29 + .../app/nlp/coref/WithinDocCoref.scala | 124 + .../app/nlp/coref/WithinDocEntity.scala | 53 + .../factorie/app/nlp/embedding/Browse.scala | 114 - .../cc/factorie/app/nlp/embedding/CBOW.scala | 200 -- .../factorie/app/nlp/embedding/SkipGram.scala | 183 -- .../nlp/embedding/WindowWordEmbedder.scala | 536 ---- .../nlp/embeddings/CBOWEmbeddingModel.scala | 88 - .../app/nlp/embeddings/Distance.scala | 115 - .../app/nlp/embeddings/EmbeddingOpts.scala | 51 - 
.../nlp/embeddings/LiteHogWildTrainer.scala | 33 - .../nlp/embeddings/SkipGramEmbedding.scala | 1 + .../embeddings/SkipGramEmbeddingModel.scala | 85 - .../app/nlp/embeddings/VocabBuilder.scala | 242 -- .../app/nlp/embeddings/WordEmbedding.scala | 54 - .../nlp/embeddings/WordEmbeddingModel.scala | 143 - .../nlp/embeddings/WordEmbeddingUtils.scala | 102 - .../factorie/app/nlp/embeddings/WordVec.scala | 43 - .../app/nlp/hcoref/CanopyPairGenerator.scala | 135 - .../factorie/app/nlp/hcoref/CorefModel.scala | 57 - .../app/nlp/hcoref/CorefSampler.scala | 123 - .../factorie/app/nlp/hcoref/DebugCoref.scala | 92 - .../app/nlp/hcoref/DebugDiffList.scala | 107 - .../app/nlp/hcoref/DefaultMoveGenerator.scala | 62 - .../hcoref/DeterministicPairGenerator.scala | 97 - .../app/nlp/hcoref/DocEntityCoref.scala | 75 - .../app/nlp/hcoref/DocEntityVars.scala | 119 - .../nlp/hcoref/HierarchicalCorefSampler.scala | 24 - .../app/nlp/hcoref/LinkingScorer.scala | 38 - .../cc/factorie/app/nlp/hcoref/Move.scala | 124 - .../app/nlp/hcoref/NoSplitMoveGenerator.scala | 57 - .../cc/factorie/app/nlp/hcoref/Node.scala | 328 --- .../app/nlp/hcoref/NodeCollection.scala | 200 -- .../app/nlp/hcoref/NodeTemplates.scala | 327 --- .../app/nlp/hcoref/PairGenerator.scala | 36 - .../factorie/app/nlp/hcoref/PostSampler.scala | 234 -- .../cc/factorie/app/nlp/hcoref/TACCoref.scala | 529 ---- .../app/nlp/hcoref/TrainingObjective.scala | 80 - .../factorie/app/nlp/hcoref/Verbosity.scala | 210 -- .../cc/factorie/app/nlp/hcoref/package.scala | 46 - .../nlp/lemma/CollapseDigitsLemmatizer.scala | 5 +- .../nlp/lemma/CollapseDigitsTokenLemma.scala | 5 + .../factorie/app/nlp/lemma/Lemmatizer.scala | 16 - .../app/nlp/lemma/LowercaseLemmatizer.scala | 6 +- .../app/nlp/lemma/LowercaseTokenLemma.scala | 6 + .../app/nlp/lemma/PorterLemmatizer.scala | 12 +- .../app/nlp/lemma/PorterTokenLemma.scala | 6 + .../nlp/lemma/SimplifyDigitsLemmatizer.scala | 6 +- .../nlp/lemma/SimplifyDigitsTokenLemma.scala | 9 + .../factorie/app/nlp/lemma/TokenLemma.scala | 3 +- .../app/nlp/lemma/WordNetLemmatizer.scala | 4 +- .../app/nlp/lemma/WordNetTokenLemma.scala | 9 + .../app/nlp/lexicon/AhoCorasick.scala | 30 +- .../app/nlp/lexicon/CustomStopWords.scala | 19 + .../factorie/app/nlp/lexicon/Determiner.scala | 26 + .../app/nlp/lexicon/GenericLexicon.scala | 9 + .../cc/factorie/app/nlp/lexicon/Lexicon.scala | 397 +-- .../app/nlp/lexicon/LexiconMention.scala | 33 + .../factorie/app/nlp/lexicon/Lexicons.scala | 264 -- .../app/nlp/lexicon/LexiconsProvider.scala | 88 + .../app/nlp/lexicon/MutableLexicon.scala | 19 + .../app/nlp/lexicon/NumberWords.scala | 62 + .../app/nlp/lexicon/PersonPronoun.scala | 44 + .../app/nlp/lexicon/PhraseLexicon.scala | 57 + .../app/nlp/lexicon/PosessiveDeterminer.scala | 14 + .../app/nlp/lexicon/Preposition.scala | 53 + .../cc/factorie/app/nlp/lexicon/Pronoun.scala | 77 + .../app/nlp/lexicon/ProvidedLexicon.scala | 13 + .../lexicon/ProvidedTriePhraseLexicon.scala | 7 + .../app/nlp/lexicon/StaticLexicons.scala | 206 +- .../factorie/app/nlp/lexicon/StopWords.scala | 16 +- .../factorie/app/nlp/lexicon/SuffixNode.scala | 20 + .../factorie/app/nlp/lexicon/SuffixTree.scala | 18 - .../app/nlp/lexicon/TriePhraseLexicon.scala | 73 + .../app/nlp/lexicon/TrieUnionLexicon.scala | 41 + .../app/nlp/lexicon/UnionLexicon.scala | 25 + .../app/nlp/lexicon/iesl/IeslLexicon.scala | 93 +- .../lexicon/iesl/es/IeslSpanishLexicon.scala | 20 - .../lexicon/mandarin/MandarinLexicon.scala | 22 - .../cc/factorie/app/nlp/lexicon/package.scala | 24 - 
.../app/nlp/lexicon/ssdi/SsdiLexicon.scala | 19 +- .../lexicon/uscensus/UscensusLexicon.scala | 10 +- .../lexicon/wikipedia/WikipediaLexicon.scala | 102 +- .../wikipedia/es/WikipediaLexicon.scala | 16 - .../scala/cc/factorie/app/nlp/load/Load.scala | 9 +- .../cc/factorie/app/nlp/load/LoadACE.scala | 168 -- .../factorie/app/nlp/load/LoadAPFCoref.scala | 324 --- .../factorie/app/nlp/load/LoadConll2000.scala | 214 -- .../factorie/app/nlp/load/LoadConll2002.scala | 118 - .../factorie/app/nlp/load/LoadConll2003.scala | 39 +- .../factorie/app/nlp/load/LoadConll2008.scala | 174 -- .../factorie/app/nlp/load/LoadConll2011.scala | 328 --- .../app/nlp/load/LoadConllCoreference.scala | 351 --- .../factorie/app/nlp/load/LoadDirectory.scala | 10 + .../app/nlp/load/LoadGermeval2014.scala | 93 - .../cc/factorie/app/nlp/load/LoadHTML.scala | 21 - .../app/nlp/load/LoadNYTimesXML.scala | 34 - .../cc/factorie/app/nlp/load/LoadOWPL.scala | 59 - .../app/nlp/load/LoadOntonotes5.scala | 161 -- .../factorie/app/nlp/load/LoadPlainText.scala | 53 - .../cc/factorie/app/nlp/load/LoadReACE.scala | 208 -- .../factorie/app/nlp/load/LoadWSJMalt.scala | 155 -- .../app/nlp/load/LoadWikipediaPlainText.scala | 165 -- .../app/nlp/load/TacFileIterator.scala | 128 - .../app/nlp/load/XMLSectionalizer.scala | 124 - .../morph/BasicMorphologicalAnalyzer.scala | 2 + .../scala/cc/factorie/app/nlp/ner/BILOU.scala | 48 + .../nlp/ner/BilouConllNerChunkAnnotator.scala | 5 + .../app/nlp/ner/BilouConllNerDomain.scala | 13 + .../app/nlp/ner/BilouConllNerTag.scala | 7 + .../ner/BilouOntonotesNerChunkAnnotator.scala | 5 + .../app/nlp/ner/BilouOntonotesNerDomain.scala | 15 + .../app/nlp/ner/BilouOntonotesNerTag.scala | 6 + .../cc/factorie/app/nlp/ner/ChainNer.scala | 61 +- .../factorie/app/nlp/ner/ConllChainNer.scala | 47 + .../factorie/app/nlp/ner/ConllNerDomain.scala | 8 + .../factorie/app/nlp/ner/ConllNerLabel.scala | 4 + .../factorie/app/nlp/ner/ConllNerSpan.scala | 5 + .../app/nlp/ner/ConllNerSpanBuffer.scala | 7 + .../app/nlp/ner/ConllNerSpanLabel.scala | 6 + .../cc/factorie/app/nlp/ner/ConllNerTag.scala | 5 + .../app/nlp/ner/ConllStackedChainNer.scala | 18 + .../app/nlp/ner/LabeledBilouNerTag.scala | 7 + .../nlp/ner/LabeledBilouOntonotesNerTag.scala | 7 + .../app/nlp/ner/LabeledConllNerTag.scala | 6 + .../app/nlp/ner/LabeledOntonotesNerTag.scala | 7 + .../app/nlp/ner/NERChunkAnnotator.scala | 29 +- .../cc/factorie/app/nlp/ner/NerSpan.scala | 14 + .../factorie/app/nlp/ner/NerSpanBuffer.scala | 6 + .../factorie/app/nlp/ner/NerSpanLabel.scala | 10 + .../cc/factorie/app/nlp/ner/NerTag.scala | 326 +-- ...NoEmbeddingsOntonotesStackedChainNer.scala | 8 + .../nlp/ner/NoEmbeddingsStackedChainNer.scala | 9 + .../app/nlp/ner/OntonotesChainNer.scala | 11 + .../nlp/ner/OntonotesEntityTypeDomain.scala | 30 + .../app/nlp/ner/OntonotesNerDomain.scala | 26 + .../app/nlp/ner/OntonotesNerSpan.scala | 5 + .../app/nlp/ner/OntonotesNerSpanBuffer.scala | 5 + .../app/nlp/ner/OntonotesNerSpanLabel.scala | 5 + .../app/nlp/ner/OntonotesNerTag.scala | 7 + .../nlp/ner/OntonotesStackedChainNer.scala | 18 + .../factorie/app/nlp/ner/SpanEncoding.scala | 26 + .../app/nlp/ner/StackedChainNer.scala | 219 +- .../app/nlp/ner/StaticLexiconFeatures.scala | 72 +- .../factorie/app/nlp/ner/TokenSequence.scala | 11 + .../factorie/app/nlp/ner/WellFormedNer.scala | 7 +- .../scala/cc/factorie/app/nlp/package.scala | 16 +- .../app/nlp/parse/CollapsedParseTree.scala | 455 --- .../nlp/parse/LightweightParseSentence.scala | 24 + .../app/nlp/parse/LightweightParseToken.scala | 12 + 
.../cc/factorie/app/nlp/parse/NullToken.scala | 9 + .../OntonotesTransitionBasedParser.scala | 4 + .../app/nlp/parse/ParseDecision.scala | 9 + .../factorie/app/nlp/parse/ParseState.scala | 123 + .../cc/factorie/app/nlp/parse/ParseTree.scala | 72 +- .../app/nlp/parse/ParseTreeLabel.scala | 7 + .../app/nlp/parse/ParseTreeLabelDomain.scala | 15 + .../app/nlp/parse/ParserConstants.scala | 45 + .../parse/ProjectiveGraphBasedParser.scala | 420 --- .../cc/factorie/app/nlp/parse/RootToken.scala | 9 + .../app/nlp/parse/TransitionBasedParser.scala | 764 ++---- .../app/nlp/phrase/AnyNerPhraseFinder.scala | 7 + .../app/nlp/phrase/ChainChunker.scala | 260 -- .../app/nlp/phrase/ConllEntityType.scala | 11 + .../nlp/phrase/ConllPhraseEntityType.scala | 3 + .../app/nlp/phrase/ConllPhraseFinder.scala | 6 + .../factorie/app/nlp/phrase/DatePhrase.scala | 27 + .../app/nlp/phrase/DatePhraseFinder.scala | 26 +- .../app/nlp/phrase/DatePhraseList.scala | 4 + .../cc/factorie/app/nlp/phrase/Gender.scala | 8 + .../app/nlp/phrase/GenderDomain.scala | 12 + .../app/nlp/phrase/HeadTokenOffset.scala | 43 + .../factorie/app/nlp/phrase/LocatedDate.scala | 3 + .../phrase/MentionPhraseNumberLabeler.scala | 6 + .../app/nlp/phrase/NPChunkMentionFinder.scala | 92 - .../nlp/phrase/NnpPosNounPhraseFinder.scala | 33 + .../phrase/NounPhraseEntityTypeLabeler.scala | 5 + .../app/nlp/phrase/NounPhraseGender.scala | 178 +- .../nlp/phrase/NounPhraseGenderLabeler.scala | 5 + .../app/nlp/phrase/NounPhraseList.scala | 4 + ...er.scala => NounPhraseNumberLabeler.scala} | 42 +- .../app/nlp/phrase/NounPhraseType.scala | 52 +- .../app/nlp/phrase/NounPhraseTypeDomain.scala | 8 + .../cc/factorie/app/nlp/phrase/Number.scala | 9 + .../app/nlp/phrase/NumberDomain.scala | 10 + .../app/nlp/phrase/OntonotesEntityType.scala | 11 + .../phrase/OntonotesPhraseEntityType.scala | 3 + ...=> OntonotesPhraseEntityTypeLabeler.scala} | 111 +- .../nlp/phrase/OntonotesPhraseFinder.scala | 6 + .../phrase/ParseAndNerBasedPhraseFinder.scala | 3 + .../nlp/phrase/ParseBasedMentionList.scala | 5 + ...ion.scala => ParseBasedPhraseFinder.scala} | 29 +- .../cc/factorie/app/nlp/phrase/Phrase.scala | 77 +- .../app/nlp/phrase/PhraseGender.scala | 5 + .../app/nlp/phrase/PhraseGenderLabeler.scala | 157 ++ .../factorie/app/nlp/phrase/PhraseList.scala | 6 + .../app/nlp/phrase/PhraseNumber.scala | 6 + .../nlp/phrase/PosBasedNounPhraseFinder.scala | 49 - .../app/nlp/phrase/VerbPhraseList.scala | 4 + .../factorie/app/nlp/pos/ChainPosTagger.scala | 319 --- .../app/nlp/pos/CtbChainPosTagger.scala | 215 -- .../cc/factorie/app/nlp/pos/CtbPosTag.scala | 81 - .../app/nlp/pos/ForwardPosTagger.scala | 305 +-- .../app/nlp/pos/LabeledPennPosTag.scala | 8 + .../app/nlp/pos/LabeledUniversalPosTag.scala | 9 + .../nlp/pos/OntoNotesForwardPosTagger.scala | 9 + .../factorie/app/nlp/pos/PennPosDomain.scala | 76 + .../cc/factorie/app/nlp/pos/PennPosTag.scala | 16 + .../cc/factorie/app/nlp/pos/PosTag.scala | 251 +- .../app/nlp/pos/UniversalPosDomain.scala | 97 + .../app/nlp/pos/UniversalPosTag.scala | 10 + .../cc/factorie/app/nlp/pos/package.scala | 4 +- .../ConllPatternBasedRelationFinder.scala | 3 + .../OntoNotesPatternBasedRelationFinder.scala | 8 + .../relation/PatterRelationPredictor.scala | 47 + .../relation/PatternBasedRelationFinder.scala | 75 +- .../factorie/app/nlp/relation/Relation.scala | 147 - .../app/nlp/relation/RelationMention.scala | 49 +- .../app/nlp/relation/RelationMentionSeq.scala | 5 + .../app/nlp/relation/TACRelation.scala | 3 + .../app/nlp/relation/TACRelationList.scala | 5 + 
.../app/nlp/segment/BigramStatistics.scala | 91 - .../app/nlp/segment/BritishToAmerican.scala | 6 + .../segment/ChainChineseWordSegmenter.scala | 424 --- .../nlp/segment/ChineseSegLabelDomains.scala | 117 - .../nlp/segment/DehyphenatingTokenizer.scala | 84 - .../segment/DeterministicLexerTokenizer.scala | 174 +- ...eterministicNormalizingHtmlTokenizer.scala | 24 + .../DeterministicNormalizingTokenizer.scala | 25 + .../segment/DeterministicRegexTokenizer.scala | 9 +- .../DeterministicSentenceSegmenter.scala | 24 +- .../nlp/segment/DeterministicTokenizer.scala | 25 + .../OntonotesNormalizedTokenString.scala | 9 + .../segment/OntonotesTokenNormalizer.scala | 11 + .../app/nlp/segment/PhraseTokenizer.scala | 116 - .../segment/PlainNormalizedTokenString.scala | 5 + .../nlp/segment/PlainTokenNormalizer.scala | 5 + .../nlp/segment/PunktSentenceSegmenter.scala | 749 ----- .../app/nlp/segment/PunktTokenizer.scala | 187 -- ...ormalizer.scala => TokenNormalizer1.scala} | 71 +- .../cc/factorie/app/nlp/wordnet/Synset.scala | 23 + .../cc/factorie/app/nlp/wordnet/WordNet.scala | 22 +- .../factorie/app/strings/PorterStemmer.scala | 9 +- .../cc/factorie/app/strings/package.scala | 6 +- .../cc/factorie/app/topics/lda/Document.scala | 5 +- .../cc/factorie/util/ModelProvider.scala | 6 +- .../util/namejuggler/PersonNameFormat.scala | 2 +- .../factorie/nlp/api/DocumentAnnotator.scala | 41 +- .../factorie/nlp/api/DocumentBuilder.scala | 60 +- src/test/resources/conll-ner-input | 65 - src/test/resources/parser-test-input | 38 - src/test/scala/cc/factorie/TestExamples.scala | 135 - .../scala/cc/factorie/TestSerialize.scala | 359 --- src/test/scala/cc/factorie/TestUtils.scala | 41 - .../app/bib/parser/TestBibtexParser.scala | 518 ---- .../classify/backend/TestClassification.scala | 40 - .../app/classify/backend/TestNaiveBayes.scala | 92 - .../scala/cc/factorie/app/mf/TestWSabie.scala | 44 - .../nlp/TestCompoundDocumentAnnotator.scala | 38 - .../app/nlp/TestDocumentAnnotatorMap.scala | 103 - .../factorie/app/nlp/TestDocumentStore.scala | 55 - .../cc/factorie/app/nlp/TokenSpanTests.scala | 40 - .../cc/factorie/app/nlp/TokenTests.scala | 37 - .../app/nlp/lexicon/TestLexicon.scala | 150 - .../nlp/lexicon/TestTriePhraseLexicon.scala | 51 - .../factorie/app/nlp/ner/TestNerTaggers.scala | 45 - .../nlp/parse/TestCollapsedParseTree.scala | 37 - .../nlp/parse/TestTransitionBasedParser.scala | 177 -- .../nlp/segment/TestBigramStatistics.scala | 31 - .../app/nlp/segment/TestLexerTokenizer.scala | 364 --- .../app/nlp/segment/TestPhraseTokenizer.scala | 42 - .../app/nlp/segment/TestRegexTokenizer.scala | 314 --- .../factorie/app/regress/TestRegression.scala | 82 - .../factorie/app/uschema/TestCoocMatrix.scala | 266 -- .../uschema/TestEntityRelationKBMatrix.scala | 170 -- .../app/uschema/TestMatrixIndexMap.scala | 73 - .../uschema/TestUniversalSchemaTrainer.scala | 153 -- .../scala/cc/factorie/directed/TestBeta.scala | 36 - .../factorie/directed/TestDirectedModel.scala | 39 - .../cc/factorie/directed/TestDirichlet.scala | 50 - .../cc/factorie/directed/TestDiscrete.scala | 25 - .../cc/factorie/directed/TestFunction.scala | 33 - .../cc/factorie/directed/TestGaussian.scala | 35 - .../directed/TestMaximizeProportions.scala | 48 - .../directed/TestPlatedDiscrete.scala | 44 - .../cc/factorie/directed/TestPoisson.scala | 33 - src/test/scala/cc/factorie/infer/TestBP.scala | 631 ----- .../scala/cc/factorie/la/TestTensor.scala | 217 -- .../scala/cc/factorie/la/TestTensor2.scala | 75 - .../scala/cc/factorie/maths/TestGamma.scala | 26 - 
.../scala/cc/factorie/model/TestModel.scala | 27 - .../factorie/model/TestProposalSamplers.scala | 93 - .../cc/factorie/model/TestTemplates.scala | 155 -- .../factorie/optimize/TestDecisionTree.scala | 130 - .../cc/factorie/optimize/TestLearning.scala | 129 - .../cc/factorie/optimize/TestOptimize.scala | 221 -- .../cc/factorie/optimize/TestSampleRank.scala | 93 - .../factorie/optimize/TestSampleRank2.scala | 315 --- .../factorie/util/TestAssignmentSolver.scala | 79 - .../cc/factorie/util/TestCmdOptions.scala | 124 - .../cc/factorie/util/TestDoubleSeq.scala | 61 - .../util/TestEvaluatableClustering.scala | 57 - .../util/TestHyperParameterSearcher.scala | 51 - .../util/TestIntAndDoubleSeqCubbie.scala | 59 - .../util/TestJsonCubbieConverter.scala | 68 - .../cc/factorie/variable/TestBagOfWords.scala | 42 - .../variable/TestCategoricalDomain.scala | 70 - .../TestCategoricalVectorVariable.scala | 41 - .../scala/cc/factorie/variable/TestDiff.scala | 44 - .../variable/TestDiscreteSeqVariable.scala | 43 - .../variable/TestDiscreteVariable.scala | 60 - .../factorie/variable/TestEdgeVariable.scala | 56 - .../cc/factorie/variable/TestEnumDomain.scala | 45 - .../variable/TestFeatureVectorVariable.scala | 56 - .../variable/TestIntegerVariable.scala | 42 - .../variable/TestLabeledVariable.scala | 76 - .../variable/TestMassesVariable.scala | 35 - .../variable/TestProportionsVariable.scala | 71 - .../factorie/variable/TestSpanVariable.scala | 70 - .../variable/TestVectorVariable.scala | 55 - 377 files changed, 4614 insertions(+), 28168 deletions(-) create mode 100644 src/main/scala/cc/factorie/app/nlp/BasicSection.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/DocumentAnnotationPipeline.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/DocumentName.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/DocumentSubstring.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/MutableDocumentAnnotatorMap.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/TokenSpanBuffer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/TokenSpanCollection.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/TokenSpanList.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/TokenString.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/UnknownDocumentAnnotator.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/AbstractEntity.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/AbstractMention.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/AcronymNounPhraseFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/BaseCorefModel.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/ConjunctionOptions.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/coref/CorefConllOutput.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/CorefModel.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/CorefSystem.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/coref/CorefTrainer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/CorefTrainerOpts.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/DefaultHashMap.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/coref/DeterministicCoref.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/coref/DeterministicNamedCoref.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/ForwardCorefBase.scala create mode 100644 
src/main/scala/cc/factorie/app/nlp/coref/ForwardCorefTrainerOpts.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/coref/ImplicitFeatureConjunctionTensor.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/coref/MentionAlignment.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/MentionBuffer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/MentionCharacteristics.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/MentionCollection.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/MentionList.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/MentionPairFeatures.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/NerForwardCoref.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/NerPhraseFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/Node.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/ParseForwardCoref.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/PronounFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/PronounSets.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/coref/StructuredCoreference.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/TokenFreqs.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/TopTokenFrequencies.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/WithinDocCoref.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/coref/WithinDocEntity.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embedding/Browse.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embedding/CBOW.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embedding/SkipGram.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embedding/WindowWordEmbedder.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embeddings/CBOWEmbeddingModel.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embeddings/Distance.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embeddings/EmbeddingOpts.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embeddings/LiteHogWildTrainer.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embeddings/SkipGramEmbeddingModel.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embeddings/VocabBuilder.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbedding.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbeddingModel.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbeddingUtils.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/embeddings/WordVec.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/CanopyPairGenerator.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/CorefModel.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/CorefSampler.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/DebugCoref.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/DebugDiffList.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/DefaultMoveGenerator.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/DeterministicPairGenerator.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/DocEntityCoref.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/DocEntityVars.scala delete mode 100644 
src/main/scala/cc/factorie/app/nlp/hcoref/HierarchicalCorefSampler.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/LinkingScorer.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/Move.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/NoSplitMoveGenerator.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/Node.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/NodeCollection.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/NodeTemplates.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/PairGenerator.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/PostSampler.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/TACCoref.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/TrainingObjective.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/Verbosity.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/hcoref/package.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lemma/CollapseDigitsTokenLemma.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lemma/LowercaseTokenLemma.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lemma/PorterTokenLemma.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lemma/SimplifyDigitsTokenLemma.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lemma/WordNetTokenLemma.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/CustomStopWords.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/Determiner.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/GenericLexicon.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/LexiconMention.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/Lexicons.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/LexiconsProvider.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/MutableLexicon.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/NumberWords.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/PersonPronoun.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/PhraseLexicon.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/PosessiveDeterminer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/Preposition.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/Pronoun.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/ProvidedLexicon.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/ProvidedTriePhraseLexicon.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/SuffixNode.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/TriePhraseLexicon.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/TrieUnionLexicon.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/UnionLexicon.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/iesl/es/IeslSpanishLexicon.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/mandarin/MandarinLexicon.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/package.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/lexicon/wikipedia/es/WikipediaLexicon.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadACE.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadAPFCoref.scala delete mode 
100644 src/main/scala/cc/factorie/app/nlp/load/LoadConll2000.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadConll2002.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadConll2008.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadConll2011.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadConllCoreference.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadDirectory.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadGermeval2014.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadHTML.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadNYTimesXML.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadOWPL.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadOntonotes5.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadPlainText.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadReACE.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadWSJMalt.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/LoadWikipediaPlainText.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/TacFileIterator.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/load/XMLSectionalizer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/BILOU.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerChunkAnnotator.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerDomain.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerChunkAnnotator.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerDomain.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/ConllChainNer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/ConllNerDomain.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/ConllNerLabel.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpan.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpanBuffer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpanLabel.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/ConllNerTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/ConllStackedChainNer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/LabeledBilouNerTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/LabeledBilouOntonotesNerTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/LabeledConllNerTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/LabeledOntonotesNerTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/NerSpan.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/NerSpanBuffer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/NerSpanLabel.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/NoEmbeddingsOntonotesStackedChainNer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/NoEmbeddingsStackedChainNer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/OntonotesChainNer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/OntonotesEntityTypeDomain.scala create mode 100644 
src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerDomain.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpan.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpanBuffer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpanLabel.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/OntonotesStackedChainNer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/SpanEncoding.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/ner/TokenSequence.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/parse/CollapsedParseTree.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/parse/LightweightParseSentence.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/parse/LightweightParseToken.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/parse/NullToken.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/parse/OntonotesTransitionBasedParser.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/parse/ParseDecision.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/parse/ParseState.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/parse/ParseTreeLabel.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/parse/ParseTreeLabelDomain.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/parse/ParserConstants.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/parse/ProjectiveGraphBasedParser.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/parse/RootToken.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/AnyNerPhraseFinder.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/ChainChunker.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/ConllEntityType.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/ConllPhraseEntityType.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/ConllPhraseFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/DatePhrase.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/DatePhraseList.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/Gender.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/GenderDomain.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/HeadTokenOffset.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/LocatedDate.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/MentionPhraseNumberLabeler.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/NPChunkMentionFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/NnpPosNounPhraseFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseEntityTypeLabeler.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseGenderLabeler.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseList.scala rename src/main/scala/cc/factorie/app/nlp/phrase/{NounPhraseNumber.scala => NounPhraseNumberLabeler.scala} (58%) create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseTypeDomain.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/Number.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/NumberDomain.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/OntonotesEntityType.scala create mode 100644 
src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseEntityType.scala rename src/main/scala/cc/factorie/app/nlp/phrase/{NounPhraseEntityType.scala => OntonotesPhraseEntityTypeLabeler.scala} (61%) create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/ParseAndNerBasedPhraseFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/ParseBasedMentionList.scala rename src/main/scala/cc/factorie/app/nlp/phrase/{ParseBasedMention.scala => ParseBasedPhraseFinder.scala} (84%) create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/PhraseGender.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/PhraseGenderLabeler.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/PhraseList.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/PhraseNumber.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/PosBasedNounPhraseFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/phrase/VerbPhraseList.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/pos/ChainPosTagger.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/pos/CtbChainPosTagger.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/pos/CtbPosTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/pos/LabeledPennPosTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/pos/LabeledUniversalPosTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/pos/OntoNotesForwardPosTagger.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/pos/PennPosDomain.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/pos/PennPosTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/pos/UniversalPosDomain.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/pos/UniversalPosTag.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/relation/ConllPatternBasedRelationFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/relation/OntoNotesPatternBasedRelationFinder.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/relation/PatterRelationPredictor.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/relation/Relation.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/relation/RelationMentionSeq.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/relation/TACRelation.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/relation/TACRelationList.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/segment/BigramStatistics.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/segment/BritishToAmerican.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/segment/ChainChineseWordSegmenter.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/segment/ChineseSegLabelDomains.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/segment/DehyphenatingTokenizer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/segment/DeterministicNormalizingHtmlTokenizer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/segment/DeterministicNormalizingTokenizer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/segment/DeterministicTokenizer.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/segment/OntonotesNormalizedTokenString.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/segment/OntonotesTokenNormalizer.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/segment/PhraseTokenizer.scala 
create mode 100644 src/main/scala/cc/factorie/app/nlp/segment/PlainNormalizedTokenString.scala create mode 100644 src/main/scala/cc/factorie/app/nlp/segment/PlainTokenNormalizer.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/segment/PunktSentenceSegmenter.scala delete mode 100644 src/main/scala/cc/factorie/app/nlp/segment/PunktTokenizer.scala rename src/main/scala/cc/factorie/app/nlp/segment/{TokenNormalizer.scala => TokenNormalizer1.scala} (64%) create mode 100644 src/main/scala/cc/factorie/app/nlp/wordnet/Synset.scala delete mode 100644 src/test/resources/conll-ner-input delete mode 100644 src/test/resources/parser-test-input delete mode 100644 src/test/scala/cc/factorie/TestExamples.scala delete mode 100644 src/test/scala/cc/factorie/TestSerialize.scala delete mode 100644 src/test/scala/cc/factorie/TestUtils.scala delete mode 100644 src/test/scala/cc/factorie/app/bib/parser/TestBibtexParser.scala delete mode 100644 src/test/scala/cc/factorie/app/classify/backend/TestClassification.scala delete mode 100644 src/test/scala/cc/factorie/app/classify/backend/TestNaiveBayes.scala delete mode 100644 src/test/scala/cc/factorie/app/mf/TestWSabie.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/TestCompoundDocumentAnnotator.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/TestDocumentAnnotatorMap.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/TestDocumentStore.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/TokenSpanTests.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/TokenTests.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/lexicon/TestLexicon.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/lexicon/TestTriePhraseLexicon.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/ner/TestNerTaggers.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/parse/TestCollapsedParseTree.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/parse/TestTransitionBasedParser.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/segment/TestBigramStatistics.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/segment/TestLexerTokenizer.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/segment/TestPhraseTokenizer.scala delete mode 100644 src/test/scala/cc/factorie/app/nlp/segment/TestRegexTokenizer.scala delete mode 100644 src/test/scala/cc/factorie/app/regress/TestRegression.scala delete mode 100644 src/test/scala/cc/factorie/app/uschema/TestCoocMatrix.scala delete mode 100644 src/test/scala/cc/factorie/app/uschema/TestEntityRelationKBMatrix.scala delete mode 100644 src/test/scala/cc/factorie/app/uschema/TestMatrixIndexMap.scala delete mode 100644 src/test/scala/cc/factorie/app/uschema/TestUniversalSchemaTrainer.scala delete mode 100644 src/test/scala/cc/factorie/directed/TestBeta.scala delete mode 100644 src/test/scala/cc/factorie/directed/TestDirectedModel.scala delete mode 100644 src/test/scala/cc/factorie/directed/TestDirichlet.scala delete mode 100644 src/test/scala/cc/factorie/directed/TestDiscrete.scala delete mode 100644 src/test/scala/cc/factorie/directed/TestFunction.scala delete mode 100644 src/test/scala/cc/factorie/directed/TestGaussian.scala delete mode 100644 src/test/scala/cc/factorie/directed/TestMaximizeProportions.scala delete mode 100644 src/test/scala/cc/factorie/directed/TestPlatedDiscrete.scala delete mode 100644 src/test/scala/cc/factorie/directed/TestPoisson.scala delete mode 100644 src/test/scala/cc/factorie/infer/TestBP.scala delete mode 100644 
src/test/scala/cc/factorie/la/TestTensor.scala delete mode 100644 src/test/scala/cc/factorie/la/TestTensor2.scala delete mode 100644 src/test/scala/cc/factorie/maths/TestGamma.scala delete mode 100644 src/test/scala/cc/factorie/model/TestModel.scala delete mode 100644 src/test/scala/cc/factorie/model/TestProposalSamplers.scala delete mode 100644 src/test/scala/cc/factorie/model/TestTemplates.scala delete mode 100644 src/test/scala/cc/factorie/optimize/TestDecisionTree.scala delete mode 100644 src/test/scala/cc/factorie/optimize/TestLearning.scala delete mode 100644 src/test/scala/cc/factorie/optimize/TestOptimize.scala delete mode 100644 src/test/scala/cc/factorie/optimize/TestSampleRank.scala delete mode 100644 src/test/scala/cc/factorie/optimize/TestSampleRank2.scala delete mode 100644 src/test/scala/cc/factorie/util/TestAssignmentSolver.scala delete mode 100644 src/test/scala/cc/factorie/util/TestCmdOptions.scala delete mode 100644 src/test/scala/cc/factorie/util/TestDoubleSeq.scala delete mode 100644 src/test/scala/cc/factorie/util/TestEvaluatableClustering.scala delete mode 100644 src/test/scala/cc/factorie/util/TestHyperParameterSearcher.scala delete mode 100644 src/test/scala/cc/factorie/util/TestIntAndDoubleSeqCubbie.scala delete mode 100644 src/test/scala/cc/factorie/util/TestJsonCubbieConverter.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestBagOfWords.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestCategoricalDomain.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestCategoricalVectorVariable.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestDiff.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestDiscreteSeqVariable.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestDiscreteVariable.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestEdgeVariable.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestEnumDomain.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestFeatureVectorVariable.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestIntegerVariable.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestLabeledVariable.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestMassesVariable.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestProportionsVariable.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestSpanVariable.scala delete mode 100644 src/test/scala/cc/factorie/variable/TestVectorVariable.scala diff --git a/.gitignore b/.gitignore index d195677..359d35b 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,4 @@ crashlytics-build.properties fabric.properties /.idea/* +/src/main/scala/worksheet.sc diff --git a/build.sbt b/build.sbt index 468afc7..c96927c 100644 --- a/build.sbt +++ b/build.sbt @@ -11,6 +11,7 @@ val scalaParserV = "1.0.6" val jblasV = "1.2.4" val apacheComsCompressV = "1.15" val apacheComsLangV = "3.6" +val factorieV = "1.2" val scalaLangDeps = Seq( "org.scala-lang.modules" %% "scala-parser-combinators" % scalaParserV, @@ -19,7 +20,8 @@ val scalaLangDeps = Seq( ) val scalaDeps = Seq( - "org.json4s" %% "json4s-jackson" % "3.5.3" + "org.json4s" %% "json4s-jackson" % "3.5.3", +"cc.factorie.app.nlp" % "all-models" % factorieV ) val javaDeps = Seq( @@ -50,4 +52,19 @@ sourceGenerators in Compile += { //Enable this only for local builds - disabled for Travis enablePlugins(JavaAppPackaging) // sbt universal:packageZipTarball -//dockerExposedPorts := Seq(9000) // sbt 
docker:publishLocal \ No newline at end of file +//dockerExposedPorts := Seq(9000) // sbt docker:publishLocal + +javaOptions in Universal ++= Seq( + // -J params will be added as jvm parameters + "-J-Xmx6g", + "-J-Xms3g" + + // others will be added as app parameters + // "-Dproperty=true", + // "-port=8080", + + // you can access any build setting/task here + //s"-version=${version.value}" +) + +resolvers += "IESL Release" at "http://dev-iesl.cs.umass.edu/nexus/content/groups/public" \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/chain/Lexicons.scala b/src/main/scala/cc/factorie/app/chain/Lexicons.scala index cabc511..6ed07f8 100644 --- a/src/main/scala/cc/factorie/app/chain/Lexicons.scala +++ b/src/main/scala/cc/factorie/app/chain/Lexicons.scala @@ -12,14 +12,15 @@ limitations under the License. */ package cc.factorie.app.chain + import cc.factorie.app.nlp.{Token, TokenSpan} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.io.BufferedSource -/** Methods of retrieving the lexicons that a token in a document (using the window around the token) or a span matches into - * returns the lexicons names, and the location the token matches into the lexicon (like B-label, I-label, U-label, or L-label) +/** Methods of retrieving the lexicon that a token in a document (using the window around the token) or a span matches into + * returns the lexicon names, and the location the token matches into the lexicon (like B-label, I-label, U-label, or L-label) @author anzaroot */ class Lexicons( val sources : List[(String,BufferedSource)]) { val lexiconMap = mutable.HashMap[String, List[String]]() @@ -50,7 +51,7 @@ class Lexicons( val sources : List[(String,BufferedSource)]) { val key = removeTrail(keyPre.map(_.string).mkString(" ")) if(lexiconMap.contains(key) && (removeTrail(token.string) != "" || (keyPre.head.position < token.position && keyPre.last.position > token.position ))) { lexes = lexiconMap(key).map(locate(token, keyPre) + _) ::: lexes - //println("Found for token: " + token.string + " with key: " + keyPre + " the lexicons: " + lexiconMap(key).mkString(" , ")) + //println("Found for token: " + token.string + " with key: " + keyPre + " the lexicon: " + lexiconMap(key).mkString(" , ")) //println("And phrase: " + phrase.map( _.string ).mkString(" ")) } } diff --git a/src/main/scala/cc/factorie/app/nlp/BasicSection.scala b/src/main/scala/cc/factorie/app/nlp/BasicSection.scala new file mode 100644 index 0000000..e0f14ee --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/BasicSection.scala @@ -0,0 +1,8 @@ +package cc.factorie.app.nlp + +/** + * Created by andrew@andrewresearch.net on 27/10/17. + */ + +/** A simple concrete implementation of Section. */ +class BasicSection(val document:Document, val stringStart:Int, val stringEnd:Int) extends Section diff --git a/src/main/scala/cc/factorie/app/nlp/Document.scala b/src/main/scala/cc/factorie/app/nlp/Document.scala index 8882464..45306fc 100644 --- a/src/main/scala/cc/factorie/app/nlp/Document.scala +++ b/src/main/scala/cc/factorie/app/nlp/Document.scala @@ -1,17 +1,5 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp + import cc.factorie.app.nlp.coref.WithinDocCoref import cc.factorie.util.{Attr, UniqueId} import cc.factorie.variable.CategoricalVar @@ -19,55 +7,43 @@ import cc.factorie.variable.CategoricalVar import scala.collection.mutable import scala.collection.mutable.ArrayBuffer -/** A portion of the string contents of a Document. - @author Andrew McCallum */ -trait DocumentSubstring { - /** The Document of which this DocumentSubstring is a part. */ - def document: Document - /** The character offset into the Document.string at which this DocumentSubstring begins. */ - def stringStart: Int - /** The character offset into the Document.string at which this DocumentSubstring is over. - In other words, the last character of the DocumentSubstring is Document.string(this.stringEnd-1). */ - def stringEnd: Int - /** The substring of the Document encompassed by this DocumentSubstring. */ - def string: String -} /** A Document holds a String containing the original raw string contents - of a natural language document to be processed. The Document also holds - a sequence of Sections, each of which is delineated by character offsets - into the Document's string, and each of which contains a sequence of Tokens, - Sentences and other TokenSpans which may be annotated. - - Documents may be constructed with their full string contents, or they may - have their string contents augmented by the appendString method. - - Documents also have an optional "name" which can be set by Document.setName. - This is typically used to hold a filename in the file system, or some other similar identifier. - - The Document.stringLength method may be a faster alternative to Document.string.length - when you are in the middle of multiple appendString calls because it will - efficiently use the underlying string buffer length, rather than flushing the buffer - to create a string. - - The canonical sequence of Sections in the Document is available through - the Document.sections method. - - By default the canonical sequence of Sections holds a single Section that covers the - entire string contents of the Document (even as the Document grows). This canonical sequence - of Sections may be modified by the user, but this special all-encompassing Section - instance will always be available as Document.asSection. - - Even though Tokens, Sentences and TokenSpans are really stored in the Sections, - Document has basic convenience methods for obtaining iterable collections of these - by concatenating them from the canonical sequence of Sections. These iterable - collections are of type Iterable[Token], not Seq[Token], however. - If you need the Tokens as a Seq[Token] rather than an Iterable[Token], or you need - more advanced queries for TokenSpan types, you should use methods on a Section, - not on the Document. In this case typical processing looks like: - "for (section <- document.sections) section.tokens.someMethodOnSeq()...". - - @author Andrew McCallum */ + * of a natural language document to be processed. 
The Document also holds + * a sequence of Sections, each of which is delineated by character offsets + * into the Document's string, and each of which contains a sequence of Tokens, + * Sentences and other TokenSpans which may be annotated. + ** + *Documents may be constructed with their full string contents, or they may + *have their string contents augmented by the appendString method. + ** + *Documents also have an optional "name" which can be set by Document.setName. + *This is typically used to hold a filename in the file system, or some other similar identifier. + ** + *The Document.stringLength method may be a faster alternative to Document.string.length + *when you are in the middle of multiple appendString calls because it will + *efficiently use the underlying string buffer length, rather than flushing the buffer + *to create a string. + ** + *The canonical sequence of Sections in the Document is available through + *the Document.sections method. + ** + *By default the canonical sequence of Sections holds a single Section that covers the + *entire string contents of the Document (even as the Document grows). This canonical sequence + *of Sections may be modified by the user, but this special all-encompassing Section + *instance will always be available as Document.asSection. + ** + *Even though Tokens, Sentences and TokenSpans are really stored in the Sections, + *Document has basic convenience methods for obtaining iterable collections of these + *by concatenating them from the canonical sequence of Sections. These iterable + *collections are of type Iterable[Token], not Seq[Token], however. + *If you need the Tokens as a Seq[Token] rather than an Iterable[Token], or you need + *more advanced queries for TokenSpan types, you should use methods on a Section, + *not on the Document. In this case typical processing looks like: + *"for (section <- document.sections) section.tokens.someMethodOnSeq()...". + ** + * + *@author Andrew McCallum */ class Document extends DocumentSubstring with Attr with UniqueId with Serializable { /** Create a new Document, initializing it to have contents given by the argument. */ def this(stringContents:String) = { this(); _string = stringContents } @@ -75,7 +51,7 @@ class Document extends DocumentSubstring with Attr with UniqueId with Serializab This may be any String, but is typically a filename or other similar identifier. */ def name: String = { val dn = this.attr[DocumentName]; if (dn ne null) dn.string else null } /** Set the value that will be returned by the 'name' method. - It accomplishes this by setting the DocumentName attr on Document. + It accomplishes this by setting the DocumentName attr on Document. If the String argument is null, it will remove DocumentName attr if present. */ def setName(s:String): this.type = { if (s ne null) this.attr += DocumentName(s) else this.attr.remove[DocumentName]; this } /** The unique identifier for this Document, e.g. used for database lookup, etc. @@ -84,7 +60,7 @@ class Document extends DocumentSubstring with Attr with UniqueId with Serializab // One of the following two is always null, the other non-null. The later is used while multiple appendString() method calls are made. private var _string: String = "" private var _stringbuf: StringBuffer = null - + /** Append the string 's' to this Document. @return the length of the Document's string before string 's' was appended. 
*/ def appendString(s:String): Int = this.synchronized { @@ -102,7 +78,7 @@ class Document extends DocumentSubstring with Attr with UniqueId with Serializab } _string } - /** The number of characters in this Document's string. + /** The number of characters in this Document's string. Use this instead of Document.string.length because it is more efficient when the Document's string is growing with appendString. */ def stringLength: Int = if (_string ne null) _string.length else _stringbuf.length @@ -113,19 +89,19 @@ class Document extends DocumentSubstring with Attr with UniqueId with Serializab def stringStart: Int = 0 /** A method required by the DocumentSubstring trait, which in this case simply returns Document.stringLength. */ def stringEnd: Int = stringLength - + // Managing sections. These are the canonical Sections, but alternative Sections can be attached as Attr's. /** A predefined Section that covers the entirety of the Document string, and even grows as the length of this Document may grow. If the user does not explicitly add Sections to the document, this Section is the only one returned by the "sections" method. */ lazy val asSection: Section = new Section { def document: Document = Document.this; def stringStart = 0; def stringEnd = document.stringEnd } private lazy val _sections: mutable.Buffer[Section] = new ArrayBuffer[Section] += asSection - /** The canonical list of Sections containing the tokens of the document. + /** The canonical list of Sections containing the tokens of the document. The user may create and add Sections covering various substrings within the Document. If the user does not explicitly add any Sections, by default there will be one Section that covers the entire Document string; this one Section is the one returned by "Document.asSection". Note that Sections may overlap with each other, representing alternative tokenizations or annotations. */ def sections: Seq[Section] = _sections // if (_sections.length == 0) Seq(asSection) else _sections - /** Add a new Section to this Document's canonical list of Sections. + /** Add a new Section to this Document's canonical list of Sections. If the only previously existing Section is the default (asSection), then remove it before adding the argument. */ def +=(s: Section) = { if (_sections.length == 1 && _sections(0) == asSection) _sections.clear(); _sections += s } /** Remove a Section from this Document's canonical list of Sections. */ @@ -138,14 +114,14 @@ class Document extends DocumentSubstring with Attr with UniqueId with Serializab def tokens: Iterable[Token] = if (sections.length == 1) sections.head.tokens else new Iterable[Token] { def iterator = for (section <- sections.iterator; token <- section.tokens.iterator) yield token } /** Return an Iterable collection of all Sentences in all canonical Sections of this Document. */ def sentences: Iterable[Sentence] = if (sections.length == 1) sections.head.sentences else new Iterable[Sentence] { def iterator = for (section <- sections.iterator; sentence <- section.sentences.iterator) yield sentence } - + /** An efficient way to get the total number of Tokens in the canonical Sections of this Document. */ def tokenCount: Int = if (sections.length == 0) sections.head.length else sections.foldLeft(0)((result, section) => result + section.length) /** An efficient way to get the total number of Sentences in the canonical Sections of this Document. 
*/ def sentenceCount: Int = if (sections.length == 0) sections.head.sentences.length else sections.foldLeft(0)((result, section) => result + section.sentences.length) - + /** The collection of DocumentAnnotators that have been run on this Document, - For keeping records of which DocumentAnnotators have been run on this document, producing which annotations. + For keeping records of which DocumentAnnotators have been run on this document, producing which annotations. A Map from the annotation class to the DocumentAnnotator that produced it, for example from classOf[cc.factorie.app.nlp.pos.PennPos] to classOf[cc.factorie.app.nlp.pos.ChainPosTagger]. Note that this map records annotations placed not just on the Document itself, but also its constituents, @@ -155,20 +131,20 @@ class Document extends DocumentSubstring with Attr with UniqueId with Serializab def hasAnnotation(c:Class[_]): Boolean = annotators.keys.exists(k => c.isAssignableFrom(k)) /** Optionally return the DocumentAnnotator that produced the annotation of class 'c' within this Document. */ def annotatorFor(c:Class[_]): Option[Class[_]] = annotators.keys.find(k => c.isAssignableFrom(k)).collect({case k:Class[_] => annotators(k)}) - -// /** Return a String containing the Token strings in the document, with sentence and span boundaries indicated with SGML. */ -// def sgmlString(spanLists:SpanList[_,_,_]*): String = { -// val buf = new StringBuffer -// for (section <- sections; token <- section.tokens) { -// if (token.isSentenceStart) buf.append("") -// token.startsSpans.foreach(span => buf.append("<"+span.name+">")) -// buf.append(token.string) -// token.endsSpans.foreach(span => buf.append("")) -// if (token.isSentenceEnd) buf.append("") -// buf.append(" ") -// } -// buf.toString -// } + + // /** Return a String containing the Token strings in the document, with sentence and span boundaries indicated with SGML. */ + // def sgmlString(spanLists:SpanList[_,_,_]*): String = { + // val buf = new StringBuffer + // for (section <- sections; token <- section.tokens) { + // if (token.isSentenceStart) buf.append("") + // token.startsSpans.foreach(span => buf.append("<"+span.name+">")) + // buf.append(token.string) + // token.endsSpans.foreach(span => buf.append("")) + // if (token.isSentenceEnd) buf.append("") + // buf.append(" ") + // } + // buf.toString + // } // Common attributes, will return null if not present def coref: WithinDocCoref = this.attr[WithinDocCoref] @@ -177,8 +153,8 @@ class Document extends DocumentSubstring with Attr with UniqueId with Serializab def getCoref: WithinDocCoref = this.attr.getOrElseUpdate[WithinDocCoref](new WithinDocCoref(this)) /** Return the gold-standard WithinDocCoref.target solution for this Document. If not already present create it. */ def getTargetCoref: WithinDocCoref = { val coref = this.getCoref; if (coref.target eq null) coref.target = new WithinDocCoref(this); coref.target } - - /** Return a String containing the Token strings in the document, formatted with one-word-per-line + + /** Return a String containing the Token strings in the document, formatted with one-word-per-line and various tab-separated attributes appended on each line, generated as specified by the argument. 
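To make the annotation record described above concrete, here is a small sketch of the query methods; it assumes a tokenizer, sentence segmenter and POS tagger have already been run on the document.

    import cc.factorie.app.nlp.Document
    import cc.factorie.app.nlp.pos.PennPosTag

    def report(doc: Document): Unit = {
      if (doc.hasAnnotation(classOf[PennPosTag]))
        println("POS tags provided by " +
          doc.annotatorFor(classOf[PennPosTag]).map(_.getName).getOrElse("an unknown annotator"))
      println(s"${doc.tokenCount} tokens in ${doc.sentenceCount} sentences")
      println("coref present: " + (doc.coref ne null)) // use getCoref to create one lazily
    }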
*/ def owplString(attributes:Iterable[(Token)=>Any]): String = { val buf = new StringBuffer @@ -199,7 +175,7 @@ class Document extends DocumentSubstring with Attr with UniqueId with Serializab } buf.toString } - /** Return a String containing the Token strings in the document, formatted with one-word-per-line + /** Return a String containing the Token strings in the document, formatted with one-word-per-line and various tab-separated attributes appended on each line, generated from the 'annotator.tokenAnnotationString' method. */ def owplString(annotator:DocumentAnnotator): String = annotator match { case pipeline:DocumentAnnotationPipeline => owplString(pipeline.annotators.map(a => a.tokenAnnotationString(_))) @@ -212,20 +188,17 @@ class Document extends DocumentSubstring with Attr with UniqueId with Serializab .find{case(start, end, _) => start <= strStart && end >= strEnd}.map(_._3) } -/** Used as an attribute on Document to hold the document's name. */ -case class DocumentName(string:String) { - override def toString: String = string -} -// TODO Consider removing DocumentCubbie because this implementation is inefficient, + +// TODO Consider removing DocumentCubbie because this implementation is inefficient, // and it isn't sensible that everyone would want the same selection of saved items. -/** A Cubbie for serializing a Document, with separate slots for the Tokens, Sentences, and TokenSpans. +/** A Cubbie for serializing a Document, with separate slots for the Tokens, Sentences, and TokenSpans. Note that it does not yet serialize Sections, and relies on Document.asSection being the only Section. */ //class DocumentCubbie[TC<:TokenCubbie,SC<:SentenceCubbie,TSC<:TokenSpanCubbie](val tc:()=>TC, val sc:()=>SC, val tsc:()=>TSC) extends Cubbie with AttrCubbieSlots { // val name = StringSlot("name") -// val string = StringSlot("string") +// val string = StringSlot("string") // val tokens = CubbieListSlot("tokens", tc) // val sentences = CubbieListSlot("sentences", sc) // val spans = CubbieListSlot("spans", tsc) diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentAnnotationPipeline.scala b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotationPipeline.scala new file mode 100644 index 0000000..902410c --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotationPipeline.scala @@ -0,0 +1,36 @@ +package cc.factorie.app.nlp + +/**User: apassos + * Date: 8/7/13 + * Time: 2:48 PM + */ + +/** A sequence of DocumentAnnotators packaged as a single DocumentAnnotator. + This class also properly populates the Document.annotators with a record of which DocumentAnnotator classes provided which annotation classes. 
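A hedged usage sketch of the one-word-per-line output above; the attribute extractors below are arbitrary examples, and any Token => Any function can be supplied.

    import cc.factorie.app.nlp.{Document, Token}

    // One token per line; each extractor contributes a tab-separated column.
    def owplDump(doc: Document): String = {
      val extractors: Seq[Token => Any] = Seq(_.string, _.lemmaString) // lemmaString falls back to the raw string
      doc.owplString(extractors)
    }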
*/ +class DocumentAnnotationPipeline(val annotators: Seq[DocumentAnnotator], val prereqAttrs: Seq[Class[_]] = Seq()) extends DocumentAnnotator { + var profile = false + var tokensProcessed = 0 + var msProcessed = 0L + val timePerAnnotator = collection.mutable.LinkedHashMap[DocumentAnnotator,Long]() + def postAttrs = annotators.flatMap(_.postAttrs).distinct + def process(document: Document) = { + var doc = document + val t00 = System.currentTimeMillis() + for (annotator <- annotators; if annotator.postAttrs.forall(!doc.hasAnnotation(_))) { + val t0 = System.currentTimeMillis() + doc = annotator.process(doc) + if (profile) timePerAnnotator(annotator) = timePerAnnotator.getOrElse(annotator, 0L) + System.currentTimeMillis() - t0 + annotator.postAttrs.foreach(a => document.annotators(a) = annotator.getClass) + } + if (profile) { + msProcessed += System.currentTimeMillis() - t00 + tokensProcessed += doc.tokenCount + } + doc + } + def profileReport: String = { + s"Processed $tokensProcessed tokens in ${msProcessed/1000.0} seconds, at ${tokensProcessed.toDouble*1000.0/msProcessed} tokens / second " + + "Speeds of individual components:\n" + timePerAnnotator.map(i => f" ${i._1.getClass.getSimpleName}%30s: ${tokensProcessed.toDouble*1000.0/i._2}%4.4f tokens/sec ").mkString("\n") + } + def tokenAnnotationString(token: Token): String = annotators.map(_.tokenAnnotationString(token)).mkString("\t") +} diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala index 0d166a0..7397c56 100644 --- a/src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala +++ b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotator.scala @@ -1,17 +1,5 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp + import cc.factorie.app.nlp.coref.Mention import cc.factorie.app.nlp.phrase.Phrase import cc.factorie.util.Threading @@ -28,46 +16,10 @@ trait DocumentAnnotator { /** How the annotation of this DocumentAnnotator should be printed in one-word-per-line (OWPL) format. If there is no per-token annotation, return null. Used in Document.owplString. */ def tokenAnnotationString(token:Token): String - + /** How the annotation of this DocumentAnnotator should be printed as extra information after a one-word-per-line (OWPL) format. If there is no document annotation, return the empty string. Used in Document.owplString. */ def documentAnnotationString(document:Document): String = "" def phraseAnnotationString(phrase:Phrase): String = "" def mentionAnnotationString(mention:Mention): String = "" } - -/** Used as a stand-in dummy DocumentAnnotator in the DocumentAnnotatorMap when an annotation was added but not by a real DocumentAnnotator. 
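A short usage sketch of the pipeline class above; the two segmenters named are the defaults registered later in this patch, and profiling is optional.

    import cc.factorie.app.nlp.{Document, DocumentAnnotationPipeline, segment}

    val pipeline = new DocumentAnnotationPipeline(Seq(
      segment.DeterministicNormalizingTokenizer,
      segment.DeterministicSentenceSegmenter))
    pipeline.profile = true
    val doc = pipeline.process(new Document("A tiny example. It has two sentences."))
    println(pipeline.profileReport)
    println(s"tokens: ${doc.tokenCount}, sentences: ${doc.sentenceCount}")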
*/ -object UnknownDocumentAnnotator extends DocumentAnnotator { - def process(document: Document): Document = document - def prereqAttrs: Iterable[Class[_]] = Nil - def postAttrs: Iterable[Class[_]] = Nil - def tokenAnnotationString(token: Token) = null -} - -object NoopDocumentAnnotator extends DocumentAnnotator { - def process(document: Document): Document = document - def prereqAttrs: Iterable[Class[_]] = Nil - def postAttrs: Iterable[Class[_]] = Nil - def tokenAnnotationString(token: Token) = null -} - -class CompoundDocumentAnnotator(val annos:Seq[DocumentAnnotator]) extends DocumentAnnotator { - // for java compat - def this(annoArr:Array[DocumentAnnotator]) = this(annoArr.toSeq) - def tokenAnnotationString(token: Token) = annos.map(anno => Option(anno.tokenAnnotationString(token))).mkString("\t") - - lazy val prereqAttrs = annos.flatMap(_.prereqAttrs).toSet diff postAttrs - lazy val postAttrs = annos.flatMap(_.postAttrs).toSet - - def process(document: Document) = { - // left fold, but faster, thanks scala - var doc = document - val iter = annos.iterator - while(iter.hasNext) { - val anno = iter.next() - //println(s"annotating document ${doc.name} with ${anno.getClass.getName}") - doc = anno.process(doc) - } - doc - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentAnnotatorPipeline.scala b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotatorPipeline.scala index 217c8c1..886c56c 100644 --- a/src/main/scala/cc/factorie/app/nlp/DocumentAnnotatorPipeline.scala +++ b/src/main/scala/cc/factorie/app/nlp/DocumentAnnotatorPipeline.scala @@ -1,92 +1,37 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ package cc.factorie.app.nlp import cc.factorie.util.FastLogging import scala.reflect.ClassTag -/**User: apassos - * Date: 8/7/13 - * Time: 2:48 PM - */ -/** A sequence of DocumentAnnotators packaged as a single DocumentAnnotator. - This class also properly populates the Document.annotators with a record of which DocumentAnnotator classes provided which annotation classes. 
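The DocumentAnnotator contract can be illustrated with a minimal, hypothetical annotator; the MyFlag attribute class is invented purely for this sketch and is not part of the patch.

    import cc.factorie.app.nlp.{Document, DocumentAnnotator, Sentence, Token}

    class MyFlag(val value: String) // hypothetical per-document attribute

    object MyFlagAnnotator extends DocumentAnnotator {
      def prereqAttrs: Iterable[Class[_]] = Seq(classOf[Token], classOf[Sentence])
      def postAttrs: Iterable[Class[_]] = Seq(classOf[MyFlag])
      def process(document: Document): Document = { document.attr += new MyFlag("seen"); document }
      def tokenAnnotationString(token: Token): String = null // no per-token output
    }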
*/ -class DocumentAnnotationPipeline(val annotators: Seq[DocumentAnnotator], val prereqAttrs: Seq[Class[_]] = Seq()) extends DocumentAnnotator { - var profile = false - var tokensProcessed = 0 - var msProcessed = 0L - val timePerAnnotator = collection.mutable.LinkedHashMap[DocumentAnnotator,Long]() - def postAttrs = annotators.flatMap(_.postAttrs).distinct - def process(document: Document) = { - var doc = document - val t00 = System.currentTimeMillis() - for (annotator <- annotators; if annotator.postAttrs.forall(!doc.hasAnnotation(_))) { - val t0 = System.currentTimeMillis() - doc = annotator.process(doc) - if (profile) timePerAnnotator(annotator) = timePerAnnotator.getOrElse(annotator, 0L) + System.currentTimeMillis() - t0 - annotator.postAttrs.foreach(a => document.annotators(a) = annotator.getClass) - } - if (profile) { - msProcessed += System.currentTimeMillis() - t00 - tokensProcessed += doc.tokenCount - } - doc - } - def profileReport: String = { - s"Processed $tokensProcessed tokens in ${msProcessed/1000.0} seconds, at ${tokensProcessed.toDouble*1000.0/msProcessed} tokens / second " + - "Speeds of individual components:\n" + timePerAnnotator.map(i => f" ${i._1.getClass.getSimpleName}%30s: ${tokensProcessed.toDouble*1000.0/i._2}%4.4f tokens/sec ").mkString("\n") - } - def tokenAnnotationString(token: Token): String = annotators.map(_.tokenAnnotationString(token)).mkString("\t") -} -/** A Map from annotation class to DocumentAnnotator that provides that annotation. - Used to store default ways of getting certain prerequisite annotations. */ -class MutableDocumentAnnotatorMap extends collection.mutable.LinkedHashMap[Class[_], () => DocumentAnnotator] { - def +=(annotator: DocumentAnnotator) = annotator.postAttrs.foreach(a => this(a) = () => annotator) -} /** A factory for creating DocumentAnnotatorPipelines given requirements about which annotations or which DocumentAnnotators are desired. */ object DocumentAnnotatorPipeline extends FastLogging { + val defaultDocumentAnnotationMap: DocumentAnnotatorMap = new collection.immutable.ListMap ++ Seq( // Note that order matters here - classOf[pos.PennPosTag] -> (() => pos.OntonotesForwardPosTagger), - classOf[parse.ParseTree] -> (() => parse.OntonotesTransitionBasedParser), - classOf[segment.PlainNormalizedTokenString] -> (() => segment.PlainTokenNormalizer), + classOf[cc.factorie.app.nlp.pos.PennPosTag] -> (() => pos.OntonotesForwardPosTagger), + classOf[cc.factorie.app.nlp.parse.ParseTree] -> (() => parse.OntonotesTransitionBasedParser), + classOf[cc.factorie.app.nlp.segment.PlainNormalizedTokenString] -> (() => segment.PlainTokenNormalizer), classOf[Token] -> (() => segment.DeterministicNormalizingTokenizer), classOf[Sentence] -> (() => segment.DeterministicSentenceSegmenter), - classOf[lemma.WordNetTokenLemma] -> (() => lemma.WordNetLemmatizer), - //classOf[lemma.SimplifyDigitsTokenLemma] -> (() => lemma.SimplifyDigitsLemmatizer), - //classOf[lemma.CollapseDigitsTokenLemma] -> (() => lemma.CollapseDigitsLemmatizer), - //classOf[lemma.PorterTokenLemma] -> (() => lemma.PorterLemmatizer), - //classOf[lemma.LowercaseTokenLemma] -> (() => lemma.LowercaseLemmatizer), - classOf[ner.NerTag] -> (() => ner.ConllChainNer), // TODO Should there be a different default? 
- //classOf[ner.BilouConllNerTag] -> (() => ner.NoEmbeddingsConllStackedChainNer), - classOf[ner.BilouOntonotesNerTag] -> (() => ner.NoEmbeddingsOntonotesStackedChainNer), - //classOf[ner.ConllNerSpanBuffer] -> (() => ner.BilouConllNerChunkAnnotator), - classOf[ner.OntonotesNerSpanBuffer] -> (() => ner.BilouOntonotesNerChunkAnnotator), - //classOf[coref.mention.NerMentionList] -> (() => coref.mention.NerAndPronounMentionFinder), - //classOf[phrase.GenderLabel[coref.Mention]] -> (() => phrase.GenderLabeler[]), - classOf[phrase.Gender] -> (() => phrase.MentionPhraseGenderLabeler), - classOf[phrase.Number] -> (() => phrase.MentionPhraseNumberLabeler), - classOf[phrase.DatePhraseList] -> (() => phrase.DatePhraseFinder), - classOf[coref.WithinDocCoref] -> (() => coref.NerForwardCoref), - classOf[relation.RelationMentionSeq] -> (() => relation.ConllPatternBasedRelationFinder) - //classOf[phrase.NumberLabel[phrase.NounPhrase]] -> (() => phrase.NounPhraseNumberLabeler), - //classOf[MentionEntityType] -> (() => coref.mention.MentionEntityTypeLabeler), - //classOf[cc.factorie.util.coref.GenericEntityMap[coref.mention.Mention]] -> (() => coref.NerForwardCoref) - + classOf[cc.factorie.app.nlp.lemma.WordNetTokenLemma] -> (() => lemma.WordNetLemmatizer), + classOf[cc.factorie.app.nlp.lemma.SimplifyDigitsTokenLemma] -> (() => lemma.SimplifyDigitsLemmatizer), + classOf[cc.factorie.app.nlp.lemma.CollapseDigitsTokenLemma] -> (() => lemma.CollapseDigitsLemmatizer), + classOf[cc.factorie.app.nlp.lemma.PorterTokenLemma] -> (() => lemma.PorterLemmatizer), + classOf[cc.factorie.app.nlp.lemma.LowercaseTokenLemma] -> (() => lemma.LowercaseLemmatizer), + classOf[cc.factorie.app.nlp.ner.NerTag] -> (() => ner.ConllChainNer), // TODO Should there be a different default? + classOf[cc.factorie.app.nlp.ner.BilouConllNerTag] -> (() => ner.NoEmbeddingsConllStackedChainNer), + classOf[cc.factorie.app.nlp.ner.BilouOntonotesNerTag] -> (() => ner.NoEmbeddingsOntonotesStackedChainNer), + classOf[cc.factorie.app.nlp.ner.ConllNerSpanBuffer] -> (() => ner.BilouConllNerChunkAnnotator), + classOf[cc.factorie.app.nlp.ner.OntonotesNerSpanBuffer] -> (() => ner.BilouOntonotesNerChunkAnnotator), + classOf[cc.factorie.app.nlp.phrase.Gender] -> (() => phrase.MentionPhraseGenderLabeler), + classOf[cc.factorie.app.nlp.phrase.Number] -> (() => phrase.MentionPhraseNumberLabeler), + classOf[cc.factorie.app.nlp.phrase.DatePhraseList] -> (() => phrase.DatePhraseFinder), + classOf[cc.factorie.app.nlp.coref.WithinDocCoref] -> (() => coref.NerForwardCoref), + classOf[cc.factorie.app.nlp.relation.RelationMentionSeq] -> (() => relation.ConllPatternBasedRelationFinder) ) //def apply(goal: Class[_]): DocumentAnnotationPipeline = apply(Seq(goal), defaultDocumentAnnotationMap) @@ -153,5 +98,4 @@ object DocumentAnnotatorPipeline extends FastLogging { } } } -} - +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentName.scala b/src/main/scala/cc/factorie/app/nlp/DocumentName.scala new file mode 100644 index 0000000..1fc7352 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/DocumentName.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp + +/** Used as an attribute on Document to hold the document's name. 
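A hedged sketch of using the default annotator map above, together with the mutable map for custom registrations; MyFlagAnnotator refers to the hypothetical annotator sketched earlier, and the map is assumed to behave like the plain Map its literal suggests.

    import cc.factorie.app.nlp._

    // The defaults map annotation classes to annotator factories; look one up and instantiate it.
    val tokenizer: DocumentAnnotator =
      DocumentAnnotatorPipeline.defaultDocumentAnnotationMap(classOf[Token])()

    // For custom setups, register annotators keyed by the annotations they produce.
    val customMap = new MutableDocumentAnnotatorMap
    customMap += tokenizer
    customMap += MyFlagAnnotator // hypothetical annotator from the earlier sketch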
*/ +case class DocumentName(string:String) { + override def toString: String = string +} diff --git a/src/main/scala/cc/factorie/app/nlp/DocumentSubstring.scala b/src/main/scala/cc/factorie/app/nlp/DocumentSubstring.scala new file mode 100644 index 0000000..845182c --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/DocumentSubstring.scala @@ -0,0 +1,16 @@ +package cc.factorie.app.nlp + +/** A portion of the string contents of a Document. + * + *@author Andrew McCallum */ +trait DocumentSubstring { + /** The Document of which this DocumentSubstring is a part. */ + def document: Document + /** The character offset into the Document.string at which this DocumentSubstring begins. */ + def stringStart: Int + /** The character offset into the Document.string at which this DocumentSubstring is over. + In other words, the last character of the DocumentSubstring is Document.string(this.stringEnd-1). */ + def stringEnd: Int + /** The substring of the Document encompassed by this DocumentSubstring. */ + def string: String +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/MutableDocumentAnnotatorMap.scala b/src/main/scala/cc/factorie/app/nlp/MutableDocumentAnnotatorMap.scala new file mode 100644 index 0000000..6d6f283 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/MutableDocumentAnnotatorMap.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp + +/** A Map from annotation class to DocumentAnnotator that provides that annotation. + *Used to store default ways of getting certain prerequisite annotations. */ +class MutableDocumentAnnotatorMap extends collection.mutable.LinkedHashMap[Class[_], () => DocumentAnnotator] { + def +=(annotator: DocumentAnnotator) = annotator.postAttrs.foreach(a => this(a) = () => annotator) +} diff --git a/src/main/scala/cc/factorie/app/nlp/Section.scala b/src/main/scala/cc/factorie/app/nlp/Section.scala index 0c28ade..125fb2f 100644 --- a/src/main/scala/cc/factorie/app/nlp/Section.scala +++ b/src/main/scala/cc/factorie/app/nlp/Section.scala @@ -1,64 +1,57 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp + import cc.factorie.util.Attr import cc.factorie.variable.Chain import scala.collection.mutable.ArrayBuffer +/** + * Created by andrew@andrewresearch.net on 27/10/17. + */ + + /** A part of a Document, delineated by character offsets into the Document's string, and which can hold a sequence of Tokens and a sequence of Sentences. 
- + By defining Section in terms of character offsets instead of by Token positions - we gain the ability to (a) split the Document into Sections before tokenization, - (b) run different tokenizers in different sections, (c) even have overlapping + we gain the ability to (a) split the Document into Sections before tokenization, + (b) run different tokenizers in different sections, (c) even have overlapping Sections with alternative tokenization and annotation for the same text. - + The canonical sequence of Sections in a Document is available as Document.sections, but a Document may have multiple overlapping Sections (for example to store alternative tokenizations or wholly distinct sets of annotations in other "non-canonical" Sections, which may be stored by some customized scheme in the Document attributes, Document.attr. - + In addition to their canonical sequence of Sections, all Documents also have a Section that encompasses the entire Document (even if the Document grows in length). This is accessed - via Document.asSection. This is the sole member of the initialized default Document.sections, but + via Document.asSection. This is the sole member of the initialized default Document.sections, but be cautious about always using Document.asSection to get the Documents Tokens, sentences and their annotations, because some other processing may reset the canonical sequence of Sections to some other collection. - + If you want to tokenize first and then split a Document into Sections, you can tokenize into Document.asSection, and then create new canonical Section at your desired boundaries, - and then re-tokenize each Section. (In the future we may provide a way to avoid the - computation of re-tokenizing.) - + and then re-tokenize each Section. (In the future we may provide a way to avoid the + computation of re-tokenizing.) + @author Andrew McCallum */ trait Section extends Chain[Section,Token] with DocumentSubstring with Attr { /** The sub-string of the Document string encompassed by this Section. - Note that the returned string will not include any Token.string substitutions - (e.g. WSJ normalization of quotation styles or de-hyphenation, typically implemented using TokenString in the Token.attr) + Note that the returned string will not include any Token.string substitutions + (e.g. WSJ normalization of quotation styles or de-hyphenation, typically implemented using TokenString in the Token.attr) from the Document's original raw string */ def string: String = document.string.substring(stringStart, stringEnd) - + /** Return the 0-based position of this Section in its Document's list of Sections. Because document.sections is easily mutable, this method does a linear search, so don't call it in an inner loop. */ def indexInDocument:Int = document.sections.indexWhere(_ == this) - + /** The sequence of Tokens inside this Section. This method is just a convenient alias for Chain.links. */ def tokens: IndexedSeq[Token] = links /** Find the Token the encompasses the character at "charOffset" beyond the start of this Section's string. */ def tokenAtCharOffset(charOffset:Int): Option[Token] = links.find(token => token.stringStart <= charOffset && token.stringEnd > charOffset) - + // Managing Sentences private var _sentences = new ArrayBuffer[Sentence] /** The sequence of Sentences in this Section. 
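A brief sketch of the Section accessors above; it assumes tokenization has already populated the document's default Section.

    import cc.factorie.app.nlp.Document

    // Map a raw character offset back to the Token that covers it, if any.
    def tokenAt(doc: Document, offset: Int): Unit =
      doc.asSection.tokenAtCharOffset(offset) match {
        case Some(token) => println(s"offset $offset falls inside '${token.string}'")
        case None        => println(s"offset $offset is not covered by any token")
      }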
Sentences can be added by Section.+=(Sentence), @@ -85,8 +78,5 @@ trait Section extends Chain[Section,Token] with DocumentSubstring with Attr { case toks if toks.size != 0 => Some(new TokenSpan(toks)) case _ => None } - -} -/** A simple concrete implementation of Section. */ -class BasicSection(val document:Document, val stringStart:Int, val stringEnd:Int) extends Section +} diff --git a/src/main/scala/cc/factorie/app/nlp/Sentence.scala b/src/main/scala/cc/factorie/app/nlp/Sentence.scala index f4d5249..5d5dff3 100644 --- a/src/main/scala/cc/factorie/app/nlp/Sentence.scala +++ b/src/main/scala/cc/factorie/app/nlp/Sentence.scala @@ -1,20 +1,12 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp +import cc.factorie.app.nlp.parse.ParseTree import cc.factorie.app.nlp.pos.PennPosTag +/** + * Created by andrew@andrewresearch.net on 27/10/17. + */ + /** A span of Tokens making up a sentence within a Section of a Document. A Sentence is a special case of a TokenSpan, stored in its Section, and available through the Section.sentences method. From the Sentence you can get its sequence of Tokens, the Section that contains it, and the Document that contains it. @@ -34,7 +26,7 @@ class Sentence(sec:Section, initialStart:Int, initialLength:Int) // Initialization // removed for efficiency -- shouldn't we do this in the annotators / loaders ? -// if (!sec.document.annotators.contains(classOf[Sentence])) sec.document.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass + // if (!sec.document.annotators.contains(classOf[Sentence])) sec.document.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass sec.addSentence(this) private val _indexInSection: Int = sec.sentences.length - 1 @@ -42,48 +34,17 @@ class Sentence(sec:Section, initialStart:Int, initialLength:Int) def indexInSection: Int = _indexInSection /** Returns true if the given Token is inside this Sentence. */ - def contains(element:Token) = tokens.contains(element) // TODO Re-implement this to be faster avoiding search using token.stringStart bounds + def contains(element:Token) = tokens.contains(element) // TODO Re-implement this to be faster avoiding search using token.stringStart bounds // Parse attributes /** If this Sentence has a ParseTree, return it; otherwise return null. */ - def parse = attr[cc.factorie.app.nlp.parse.ParseTree] + def parse = attr[ParseTree] /** Return the Token at the root of this Sentence's ParseTree. Will throw an exception if there is no ParseTree. */ - def parseRootChild: Token = attr[cc.factorie.app.nlp.parse.ParseTree].rootChild + def parseRootChild: Token = attr[ParseTree].rootChild // common labels /** Returns the sequence of PennPosTags attributed to the sequence of Tokens in this Sentence. 
*/ - def posTags: IndexedSeq[pos.PennPosTag] = tokens.map(_.attr[PennPosTag]) + def posTags: IndexedSeq[cc.factorie.app.nlp.pos.PennPosTag] = tokens.map(_.attr[PennPosTag]) /** Returns the sequence of NerTags attributed to the sequence of Tokens in this Sentence. */ - def nerTags: IndexedSeq[ner.NerTag] = tokens.map(_.nerTag) -} - - -// Cubbie storage - -class SentenceCubbie extends TokenSpanCubbie { - def finishStoreSentence(s:Sentence): Unit = {} - def storeSentence(s:Sentence): this.type = { - storeTokenSpan(s) // also calls finishStoreTokenSpan(s) - finishStoreSentence(s) - this - } - def finishFetchSentence(s:Sentence): Unit = finishFetchTokenSpan(s) - def fetchSentence(section:Section): Sentence = { - val s = new Sentence(section, start.value, length.value) - finishFetchSentence(s) - s - } -} - -// To save the sentence with its parse tree use "new SentenceCubbie with SentenceParseTreeCubbie" -trait SentenceParseCubbie extends SentenceCubbie { - val parse = CubbieSlot("parse", () => new cc.factorie.app.nlp.parse.ParseTreeCubbie) - override def finishStoreSentence(s:Sentence): Unit = { - super.finishStoreSentence(s) - parse := parse.constructor().storeParseTree(s.parse) - } - override def finishFetchSentence(s:Sentence): Unit = { - super.finishFetchSentence(s) - s.attr += parse.value.fetchParseTree(s) - } + def nerTags: IndexedSeq[cc.factorie.app.nlp.ner.NerTag] = tokens.map(_.nerTag) } diff --git a/src/main/scala/cc/factorie/app/nlp/Token.scala b/src/main/scala/cc/factorie/app/nlp/Token.scala index 7b657d4..ea9f8dc 100644 --- a/src/main/scala/cc/factorie/app/nlp/Token.scala +++ b/src/main/scala/cc/factorie/app/nlp/Token.scala @@ -1,22 +1,18 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp -import cc.factorie.util.{Attr, Cubbie} -import cc.factorie.variable.{CategoricalValue, ChainLink, StringVariable} + +import cc.factorie.app.nlp.lemma.TokenLemma +import cc.factorie.app.nlp.ner.NerTag +import cc.factorie.app.nlp.parse.{ParseTree, ParseTreeLabel} +import cc.factorie.app.nlp.pos.PosTag +import cc.factorie.util.Attr +import cc.factorie.variable.{CategoricalValue, ChainLink} import scala.collection.mutable -import cc.factorie.app.nlp.ner.BioConllNerTag + +/** + * Created by andrew@andrewresearch.net on 27/10/17. + */ + // There are two ways to create Tokens and add them to Sentences and/or Documents: // Without String arguments, in which case the string is assumed to already be in the Document @@ -28,19 +24,19 @@ import cc.factorie.app.nlp.ner.BioConllNerTag Token constructors that include a Sentence automatically add the Token to the Sentence and its Section. Token constructors that include a tokenString automatically append the tokenString to the Document's string. @param stringStart The offset into the Document string of the first character of the Token. 
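To make the Sentence accessors above concrete, a hedged sketch follows; it assumes sentence segmentation, POS tagging and parsing have already been run, otherwise posTags and parse would be empty or null.

    import cc.factorie.app.nlp.Document

    def describeSentences(doc: Document): Unit =
      for (sentence <- doc.sentences) {
        println(sentence.tokens.map(_.string).mkString(" "))
        println("POS:  " + sentence.posTags.map(_.categoryValue).mkString(" "))
        if (sentence.parse ne null) println("root: " + sentence.parseRootChild.string)
      }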
- @param stringEnd The offset into the Document string of the character immediately after the last character of the Token. */ +@param stringEnd The offset into the Document string of the character immediately after the last character of the Token. */ class Token(val stringStart:Int, val stringEnd:Int) extends cc.factorie.app.chain.Observation[Token] with ChainLink[Token,Section] with DocumentSubstring with Attr with Serializable { assert(stringStart <= stringEnd) -// override def _setChainPosition(c:Section, p:Int): Unit = { -// super._setChainPosition(c, p) -// assert(stringStart < section.stringEnd && stringStart >= section.stringStart && stringEnd <= section.stringEnd) -// } + // override def _setChainPosition(c:Section, p:Int): Unit = { + // super._setChainPosition(c, p) + // assert(stringStart < section.stringEnd && stringStart >= section.stringStart && stringEnd <= section.stringEnd) + // } /** Create a Token and also append it to the list of Tokens in the Section. There must not already be Tokens in the document with higher stringStart indices. Note that the start and end indices are character offsets into the Document string, not the Section string. @param stringStart The offset into the Document string of the first character of the Token. - @param stringEnd The offset into the Document string of the character immediately after the last character of the Token. */ + @param stringEnd The offset into the Document string of the character immediately after the last character of the Token. */ def this(sec:Section, stringStart:Int, stringEnd:Int) = { this(stringStart, stringEnd) assert(sec ne null) @@ -73,7 +69,7 @@ class Token(val stringStart:Int, val stringEnd:Int) /** The Document containing this Token's Section. */ def document: Document = chain.document /** Return the substring of the original Document string covered by the character indices stringStart to stringEnd. - This may be different than the String returned by this.string if the TokenString attribute has been set. + This may be different than the String returned by this.string if the TokenString attribute has been set. (Such substitutions are useful for de-hyphenation, downcasing, and other such modifications. */ def docSubstring = document.string.substring(stringStart, stringEnd) /** Return the string contents of this Token, either from its attr[TokenString] variable or, if unset, directly as a substring of the Document */ @@ -83,7 +79,7 @@ class Token(val stringStart:Int, val stringEnd:Int) /** Return the string contents of this Token, either from its specified attr[C], or if unset, directly as a substring of the Document. */ def normalizedString[C<:TokenString](attrClass:Class[C]): String = { val ts = attr(attrClass); if (ts ne null) ts.value else docSubstring } /** Return the lemma of the string contents of the Token, either from its attr[TokenLemma] variable or,if unset, from token.string. */ - def lemmaString: String = { val tl = attr[cc.factorie.app.nlp.lemma.TokenLemma]; if (tl ne null) tl.value else string } + def lemmaString: String = { val tl = attr[TokenLemma]; if (tl ne null) tl.value else string } /** Return the 0-start index of this token in its Section. */ def positionInSection: Int = position // TODO The ClearSegmenter should set Token._sentence, so the "sentence" method doesn't have to search for it. 
-akm @@ -111,21 +107,21 @@ class Token(val stringStart:Int, val stringEnd:Int) } // Common attributes, will return null if not present - def posTag = attr[cc.factorie.app.nlp.pos.PosTag] - def nerTag = attr[cc.factorie.app.nlp.ner.NerTag] - def lemma = attr[cc.factorie.app.nlp.lemma.TokenLemma] + def posTag = attr[PosTag] + def nerTag = attr[NerTag] + def lemma = attr[TokenLemma] // Parse attributes, will throw exception if parse is not present - def parse = sentence.attr[cc.factorie.app.nlp.parse.ParseTree] - def parseParent: Token = sentence.attr[cc.factorie.app.nlp.parse.ParseTree].parent(positionInSentence) - def parseParentIndex: Int = sentence.attr[cc.factorie.app.nlp.parse.ParseTree].parentIndex(positionInSentence) - def parseLabel: cc.factorie.app.nlp.parse.ParseTreeLabel = sentence.attr[cc.factorie.app.nlp.parse.ParseTree].label(positionInSentence) - def parseChildren: Seq[Token] = sentence.attr[cc.factorie.app.nlp.parse.ParseTree].children(positionInSentence) - def parseLeftChildren: Seq[Token] = sentence.attr[cc.factorie.app.nlp.parse.ParseTree].leftChildren(positionInSentence) - def parseRightChildren: Seq[Token] = sentence.attr[cc.factorie.app.nlp.parse.ParseTree].rightChildren(positionInSentence) - def parseChildrenLabeled(label:CategoricalValue[String]): Seq[Token] = sentence.attr[cc.factorie.app.nlp.parse.ParseTree].childrenLabeled(positionInSentence, label.intValue) - def parseLeftChildrenLabeled(label:CategoricalValue[String]): Seq[Token] = sentence.attr[cc.factorie.app.nlp.parse.ParseTree].leftChildrenLabeled(positionInSentence, label.intValue) - def parseRightChildrenLabeled(label:CategoricalValue[String]): Seq[Token] = sentence.attr[cc.factorie.app.nlp.parse.ParseTree].rightChildrenLabeled(positionInSentence, label.intValue) - + def parse = sentence.attr[ParseTree] + def parseParent: Token = sentence.attr[ParseTree].parent(positionInSentence) + def parseParentIndex: Int = sentence.attr[ParseTree].parentIndex(positionInSentence) + def parseLabel: ParseTreeLabel = sentence.attr[ParseTree].label(positionInSentence) + def parseChildren: Seq[Token] = sentence.attr[ParseTree].children(positionInSentence) + def parseLeftChildren: Seq[Token] = sentence.attr[ParseTree].leftChildren(positionInSentence) + def parseRightChildren: Seq[Token] = sentence.attr[ParseTree].rightChildren(positionInSentence) + def parseChildrenLabeled(label:CategoricalValue[String]): Seq[Token] = sentence.attr[ParseTree].childrenLabeled(positionInSentence, label.intValue) + def parseLeftChildrenLabeled(label:CategoricalValue[String]): Seq[Token] = sentence.attr[ParseTree].leftChildrenLabeled(positionInSentence, label.intValue) + def parseRightChildrenLabeled(label:CategoricalValue[String]): Seq[Token] = sentence.attr[ParseTree].rightChildrenLabeled(positionInSentence, label.intValue) + // Sentence methods private[nlp] var _sentence: Sentence = null // This must be changeable from outside because sometimes Tokenization comes before Sentence segmentation def sentence: Sentence = { @@ -139,19 +135,19 @@ class Token(val stringStart:Int, val stringEnd:Int) def isInSentence: Boolean = sentence ne null def isSentenceStart: Boolean = (sentence ne null) && sentence.start == position def isSentenceEnd: Boolean = (sentence ne null) && sentence.end-1 == position - + // Span methods. Don't delete these yet. Still small chance may have a canonical "SpanList" in Section. 
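A small sketch of the per-Token accessors above; it assumes the relevant taggers have run, since these attributes return null when absent.

    import cc.factorie.app.nlp.{Document, Token}

    def describeToken(token: Token): String = {
      val pos = if (token.posTag ne null) token.posTag.categoryValue else "-"
      val ner = if (token.nerTag ne null) token.nerTag.categoryValue else "-"
      s"${token.positionInSection}\t${token.string}\t${token.lemmaString}\t$pos\t$ner"
    }

    def dumpTokens(doc: Document): Unit = doc.tokens.foreach(t => println(describeToken(t)))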
-// def inSpan: Boolean = chain.hasSpanContaining(position) -// def inSpanOfClass[A<:TokenSpan](c:Class[A]): Boolean = chain.hasSpanOfClassContaining(c, position) -// def inSpanOfClass[A<:TokenSpan:ClassTag]: Boolean = chain.hasSpanOfClassContaining(m.erasure.asInstanceOf[Class[A]], position) -// def spans:Seq[TokenSpan] = chain.spansContaining(position) //.toList -// def spansOfClass[A<:TokenSpan](c:Class[A]) = chain.spansOfClassContaining(c, position) -// def spansOfClass[A<:TokenSpan:ClassTag] = chain.spansOfClassContaining(m.erasure.asInstanceOf[Class[A]], position) -// def startsSpans: Iterable[TokenSpan] = chain.spansStartingAt(position) -// def startsSpansOfClass[A<:TokenSpan:ClassTag]: Iterable[A] = chain.spansOfClassStartingAt(position) -// def endsSpans: Iterable[TokenSpan] = chain.spansEndingAt(position) -// def endsSpansOfClass[A<:TokenSpan:ClassTag]: Iterable[A] = chain.spansOfClassEndingAt(position) - + // def inSpan: Boolean = chain.hasSpanContaining(position) + // def inSpanOfClass[A<:TokenSpan](c:Class[A]): Boolean = chain.hasSpanOfClassContaining(c, position) + // def inSpanOfClass[A<:TokenSpan:ClassTag]: Boolean = chain.hasSpanOfClassContaining(m.erasure.asInstanceOf[Class[A]], position) + // def spans:Seq[TokenSpan] = chain.spansContaining(position) //.toList + // def spansOfClass[A<:TokenSpan](c:Class[A]) = chain.spansOfClassContaining(c, position) + // def spansOfClass[A<:TokenSpan:ClassTag] = chain.spansOfClassContaining(m.erasure.asInstanceOf[Class[A]], position) + // def startsSpans: Iterable[TokenSpan] = chain.spansStartingAt(position) + // def startsSpansOfClass[A<:TokenSpan:ClassTag]: Iterable[A] = chain.spansOfClassStartingAt(position) + // def endsSpans: Iterable[TokenSpan] = chain.spansEndingAt(position) + // def endsSpansOfClass[A<:TokenSpan:ClassTag]: Iterable[A] = chain.spansOfClassEndingAt(position) + // String feature help: def matches(t2:Token): Boolean = string == t2.string // TODO Consider renaming "stringMatches" /** Return true if the first character of the word is upper case. */ @@ -166,7 +162,7 @@ class Token(val stringStart:Int, val stringEnd:Int) def isDigits: Boolean = string.matches("\\d+") /** Return true if the word contains at least one digit. */ def containsDigit: Boolean = string.matches(".*\\d.*") - /** Return a string that captures the generic "shape" of the original word, + /** Return a string that captures the generic "shape" of the original word, mapping lowercase alphabetics to 'a', uppercase to 'A', digits to '1', whitespace to ' '. Skip more than 'maxRepetitions' of the same character class. */ def wordShape(maxRepetitions:Int = 2): String = cc.factorie.app.strings.stringShape(string, maxRepetitions) @@ -184,70 +180,4 @@ class Token(val stringStart:Int, val stringEnd:Int) If instead you want the string contents of the token use the method "string". */ override def toString = "Token("+stringStart+":"+string+")" -} - -/** Used as an attribute of Token when the token.string should return something - different than the document.string.substring at the Token's start and end positions. - For example, de-hyphenation may change "probab\n-ly" to "probably". 
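The string-feature helpers above are typically used when assembling feature vectors; a hedged sketch, with the feature names invented for the example.

    import cc.factorie.app.nlp.Token
    import scala.collection.mutable.ArrayBuffer

    def orthographicFeatures(token: Token): Seq[String] = {
      val fs = ArrayBuffer[String]()
      fs += "SHAPE=" + token.wordShape(2) // letters map to 'A'/'a', digits to '1', repeats capped at 2
      if (token.isDigits) fs += "ALLDIGITS"
      if (token.containsDigit) fs += "HASDIGIT"
      if (token.containsLowerCase) fs += "HASLOWER"
      fs
    }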
*/ -class TokenString(val token:Token, s:String) extends StringVariable(s) - - - -// Cubbie storage - -class TokenCubbie extends Cubbie { - val start = IntSlot("start") - val end = IntSlot("end") - def postFetchToken(t:Token): Unit = {} - def fetchToken: Token = { - val t = new Token(start.value, end.value) - postFetchToken(t) - t - } - def postStoreToken(t:Token): Unit = {} - def storeToken(t:Token): this.type = { - start := t.stringStart - end := t.stringEnd - postStoreToken(t) - this - } -} - -trait TokenStringCubbieSlot extends TokenCubbie { - val string = StringSlot("string") - override def postStoreToken(t:Token): Unit = { - super.postStoreToken(t) - string := t.string - } - // No postFetchToken necessary because "string" isn't needed for Token initialization -} - -trait TokenBioConllNerTagCubbie extends TokenCubbie { - val ner = StringSlot("ner") - def newTokenNerLabel(t:Token, s:String): BioConllNerTag - override def storeToken(t:Token): this.type = { - super.storeToken(t) - ner := t.nerTag.categoryValue - this - } - override def fetchToken: Token = { - val t = super.fetchToken - t.attr += newTokenNerLabel(t, ner.value) - t - } -} - -trait TokenPennPosTagCubbie extends TokenCubbie { - val pos = StringSlot("pos") - def newTokenPosLabel(t:Token, s:String): cc.factorie.app.nlp.pos.PennPosTag - override def storeToken(t:Token): this.type = { - super.storeToken(t) - pos:= t.posTag.categoryValue - this - } - override def fetchToken: Token = { - val t = super.fetchToken - t.attr += newTokenPosLabel(t, pos.value) - t - } -} +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/TokenSpan.scala b/src/main/scala/cc/factorie/app/nlp/TokenSpan.scala index 458bca0..dfedd79 100644 --- a/src/main/scala/cc/factorie/app/nlp/TokenSpan.scala +++ b/src/main/scala/cc/factorie/app/nlp/TokenSpan.scala @@ -1,22 +1,11 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp -import cc.factorie.util.{Attr, Cubbie} -import cc.factorie.variable._ + +import cc.factorie.util.Attr +import cc.factorie.variable.SpanVariable import scala.collection.mutable + /** A sub-sequence of Tokens within a Section (which is in turn part of a Document). */ class TokenSpan(theSection:Section, initialStart:Int, initialLength:Int) extends SpanVariable[Section,Token](theSection, initialStart, initialLength) with Attr with Ordered[TokenSpan] with Serializable { @@ -30,7 +19,7 @@ class TokenSpan(theSection:Section, initialStart:Int, initialLength:Int) /** The Sentence to which the first Token in this TokenSpan belongs. */ def sentence = tokens(0).sentence // TODO Implement something like this? def containsSentenceIndex(i:Int): Boolean // Does this TokenSpan contain the token in the ith position of the sentence containing this TokenSpan. - + /** Return the substring of the Document covered by this TokenSpan. 
If this is a multi-Token TokenSpan, this will include all original characters in the Document, including those skipped by tokenization. */ def documentString: String = document.string.substring(tokens.head.stringStart, tokens.last.stringEnd) // TODO Handle Token.attr[TokenString] changes @@ -56,13 +45,13 @@ class TokenSpan(theSection:Section, initialStart:Int, initialLength:Int) override def toString = "TokenSpan("+start+","+end+":"+this.string+")" /** - * Returns the character offsets of this TokenSpan into the raw text of its original document. - */ + * Returns the character offsets of this TokenSpan into the raw text of its original document. + */ def characterOffsets:(Int, Int) = this.head.stringStart -> this.last.stringEnd /** - * Returns a sequence of tokens that contains @param size tokens before and after the tokenspan. - */ + * Returns a sequence of tokens that contains @param size tokens before and after the tokenspan. + */ def contextWindow(size:Int):Seq[Token] = { var idx = 0 var window = mutable.ArrayBuffer[Token]() @@ -84,8 +73,8 @@ class TokenSpan(theSection:Section, initialStart:Int, initialLength:Int) } /** - * Returns an iterable over tokens before and after the token span without preserving order - */ + * Returns an iterable over tokens before and after the token span without preserving order + */ def contextBag(size:Int):Iterable[Token] = { var idx = 0 var window = mutable.ArrayBuffer[Token]() @@ -107,8 +96,8 @@ class TokenSpan(theSection:Section, initialStart:Int, initialLength:Int) /** - * Implements ordering between two tokenspans, assumed to share the same document - */ + * Implements ordering between two tokenspans, assumed to share the same document + */ def compare(other: TokenSpan): Int = if(this.section.head.stringStart > other.section.head.stringStart) { 1 } else if(this.section.head.stringStart < other.section.head.stringStart) { @@ -134,68 +123,4 @@ class TokenSpan(theSection:Section, initialStart:Int, initialLength:Int) } } } -} -trait TokenSpanCollection[S<:TokenSpan] extends SpanVarCollection[S, Section, Token] - - -/** An immutable collection of TokenSpans, with various methods to returns filtered sub-sets of spans based on position and class. */ -class TokenSpanList[S<:TokenSpan](spans:Iterable[S]) extends SpanVarList[S, Section, Token](spans) with TokenSpanCollection[S] - -/** A mutable collection of TokenSpans, with various methods to returns filtered sub-sets of spans based on position and class. 
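A hedged sketch of the TokenSpan utilities above; the span boundaries are arbitrary and the section is assumed to have been tokenized.

    import cc.factorie.app.nlp.{Document, TokenSpan}

    def firstTrigram(doc: Document): Option[TokenSpan] = {
      val section = doc.asSection
      if (section.tokens.length < 3) None
      else {
        val span = new TokenSpan(section, 0, 3) // first three tokens
        val (start, end) = span.characterOffsets // raw character offsets into the document string
        println(s"'${span.documentString}' covers characters $start until $end")
        println("context: " + span.contextWindow(2).map(_.string).mkString(" "))
        Some(span)
      }
    }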
*/ -class TokenSpanBuffer[S<:TokenSpan] extends SpanVarBuffer[S, Section, Token] with TokenSpanCollection[S] with Serializable - -// Cubbie storage - -class TokenSpanCubbie extends Cubbie { - val start = IntSlot("start") - val length = IntSlot("length") - def storeTokenSpan(ts:TokenSpan): this.type = { - start := ts.start - length := ts.length - finishStoreTokenSpan(ts) - this - } - def finishStoreTokenSpan(ts:TokenSpan): Unit = {} - def fetchTokenSpan(section:Section): TokenSpan = { - val ts = new TokenSpan(section, start.value, length.value) - finishFetchTokenSpan(ts) - ts - } - def finishFetchTokenSpan(ts:TokenSpan): Unit = {} -} - -trait TokenSpanWithPhraseCubbie extends TokenSpanCubbie { - val phrase = StringSlot("phrase") - override def finishStoreTokenSpan(ts:TokenSpan): Unit = { - super.finishStoreTokenSpan(ts) - phrase := ts.string - } -} - -//trait TokenSpanWithDocRefCubbie[DC<:DocumentCubbie[_,_,_]] extends TokenSpanCubbie { -// def newDocumentCubbie: DC -// val doc = RefSlot("doc", ()=>newDocumentCubbie) -// override def finishStoreTokenSpan(ts:TokenSpan): Unit = { -// super.finishStoreTokenSpan(ts) -// doc := ts.document.name -// } -// def fetchTokenSpan(/* implicit cr:CubbieRefs */): TokenSpan = { -// throw new Error("Not yet implemented") -// val ts = new TokenSpan(null, start.value, length.value) -// finishFetchTokenSpan(ts) -// ts -// } -//} -// -//trait TokenSpanNerLabelCubbieSlot extends TokenSpanCubbie { -// def newTokenSpanNerLabel(ts:TokenSpan, s:String): cc.factorie.app.nlp.ner.NerSpanLabel -// val ner = StringSlot("ner") -// override def finishStoreTokenSpan(ts:TokenSpan): Unit = { -// super.finishStoreTokenSpan(ts) -// ner := ts.attr[cc.factorie.app.nlp.ner.NerSpanLabel].categoryValue -// } -// override def finishFetchTokenSpan(ts:TokenSpan): Unit = { -// super.finishFetchTokenSpan(ts) -// ts.attr += newTokenSpanNerLabel(ts, ner.value) -// } -//} +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/TokenSpanBuffer.scala b/src/main/scala/cc/factorie/app/nlp/TokenSpanBuffer.scala new file mode 100644 index 0000000..17829d8 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/TokenSpanBuffer.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp + +import cc.factorie.variable.SpanVarBuffer + +/** A mutable collection of TokenSpans, with various methods to returns filtered sub-sets of spans based on position and class. */ +class TokenSpanBuffer[S<:TokenSpan] extends SpanVarBuffer[S, Section, Token] with TokenSpanCollection[S] with Serializable \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/TokenSpanCollection.scala b/src/main/scala/cc/factorie/app/nlp/TokenSpanCollection.scala new file mode 100644 index 0000000..cc49d71 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/TokenSpanCollection.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp + +import cc.factorie.variable.SpanVarCollection + +trait TokenSpanCollection[S<:TokenSpan] extends SpanVarCollection[S, Section, Token] \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/TokenSpanList.scala b/src/main/scala/cc/factorie/app/nlp/TokenSpanList.scala new file mode 100644 index 0000000..769df1b --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/TokenSpanList.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp + +import cc.factorie.variable.SpanVarList + +/** An immutable collection of TokenSpans, with various methods to returns filtered sub-sets of spans based on position and class. 
*/ +class TokenSpanList[S<:TokenSpan](spans:Iterable[S]) extends SpanVarList[S, Section, Token](spans) with TokenSpanCollection[S] diff --git a/src/main/scala/cc/factorie/app/nlp/TokenString.scala b/src/main/scala/cc/factorie/app/nlp/TokenString.scala new file mode 100644 index 0000000..defcd27 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/TokenString.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp + +import cc.factorie.variable.StringVariable + +/** Used as an attribute of Token when the token.string should return something + * different than the document.string.substring at the Token's start and end positions. + * For example, de-hyphenation may change "probab\n-ly" to "probably". */ +class TokenString(val token:Token, s:String) extends StringVariable(s) + diff --git a/src/main/scala/cc/factorie/app/nlp/UnknownDocumentAnnotator.scala b/src/main/scala/cc/factorie/app/nlp/UnknownDocumentAnnotator.scala new file mode 100644 index 0000000..7347522 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/UnknownDocumentAnnotator.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp + +/** Used as a stand-in dummy DocumentAnnotator in the DocumentAnnotatorMap when an annotation was added but not by a real DocumentAnnotator. */ +object UnknownDocumentAnnotator extends DocumentAnnotator { + def process(document: Document): Document = document + def prereqAttrs: Iterable[Class[_]] = Nil + def postAttrs: Iterable[Class[_]] = Nil + def tokenAnnotationString(token: Token) = null +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/AbstractEntity.scala b/src/main/scala/cc/factorie/app/nlp/coref/AbstractEntity.scala new file mode 100644 index 0000000..45e2034 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/AbstractEntity.scala @@ -0,0 +1,12 @@ +package cc.factorie.app.nlp.coref + +/** An "entity" in an entity resolution problem. + A non-leaf Node in a coreference hierarchy. + It could be a root (entity) or an intermediate node (sub-entity in hierarchical coref). + This is the super-trait for entities in both within-document coreference and cross-document entity resolution. + @author Andrew McCallum */ +trait AbstractEntity extends Node { + def children: Iterable[Node] // Immediate children + def childIds: Iterable[String] = children.map(_.uniqueId) + def mentions: Iterable[AbstractMention] // Leaves of tree +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/AbstractMention.scala b/src/main/scala/cc/factorie/app/nlp/coref/AbstractMention.scala new file mode 100644 index 0000000..54a2852 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/AbstractMention.scala @@ -0,0 +1,13 @@ +package cc.factorie.app.nlp.coref + +/** A "mention" of an entity in a resolution problem. + A leaf in a coreference hierarchy. + This is the super-trait for mentions in both within-document coreference and cross-document entity resolution. + @author Andrew McCallum */ +trait AbstractMention extends Node { + def parent: ParentType + /** The root of the coreference tree in which this mention is a leaf. */ + def entity: ParentType + /** A string representation of the observed mention, e.g. "Michael Smith". 
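To illustrate the TokenString attribute above, a short hedged sketch of string substitution; the replacement text is arbitrary.

    import cc.factorie.app.nlp.{Token, TokenString}

    // Override the surface form without changing the underlying document text,
    // e.g. to undo a line-break hyphenation.
    def dehyphenate(token: Token, repaired: String): Unit =
      token.attr += new TokenString(token, repaired)

    // Afterwards token.string returns the repaired form, while token.docSubstring
    // still returns the original characters from the Document.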
*/ + def string: String +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/AcronymNounPhraseFinder.scala b/src/main/scala/cc/factorie/app/nlp/coref/AcronymNounPhraseFinder.scala new file mode 100644 index 0000000..ced395f --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/AcronymNounPhraseFinder.scala @@ -0,0 +1,26 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.app.nlp.phrase.{ConllPhraseEntityType, NounPhraseType, Phrase} +import cc.factorie.app.nlp.{Document, Token} + +import scala.collection.mutable + +/** Apply returns a list of acronym noun phrases. + * + * @author Andrew McCallum */ +object AcronymNounPhraseFinder extends MentionPhraseFinder { + def prereqAttrs = Seq(classOf[Token]) + def apply(doc:Document): Seq[Phrase] = { + val result = new mutable.ArrayBuffer[Phrase] + for (section <- doc.sections; token <- section.tokens) { + // Matches middle word of "Yesterday IBM announced" but not "OBAMA WINS ELECTION" + if ( token.string.length > 2 && !token.containsLowerCase && Character.isUpperCase(token.string(0)) && (token.getNext ++ token.getPrev).exists(_.containsLowerCase)) { + val phrase = new Phrase(section, token.positionInSection, length=1,offsetToHeadToken = -1) + phrase.attr += new ConllPhraseEntityType(phrase, "ORG") + phrase.attr += new NounPhraseType(phrase, "NAM") + result += phrase + } + } + result + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/BaseCorefModel.scala b/src/main/scala/cc/factorie/app/nlp/coref/BaseCorefModel.scala new file mode 100644 index 0000000..0c84dde --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/BaseCorefModel.scala @@ -0,0 +1,10 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.la +import cc.factorie.la.{Tensor1, WeightsMapAccumulator} + +class BaseCorefModel extends PairwiseCorefModel { + val pairwise = Weights(new la.DenseTensor1(MentionPairFeaturesDomain.dimensionDomain.maxSize)) + def predict(pairwiseStats: Tensor1) = pairwise.value dot pairwiseStats + def accumulateObjectiveGradient(accumulator: WeightsMapAccumulator, features: Tensor1, gradient: Double, weight: Double) = accumulator.accumulate(pairwise, features, gradient * weight) +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/ConjunctionOptions.scala b/src/main/scala/cc/factorie/app/nlp/coref/ConjunctionOptions.scala new file mode 100644 index 0000000..bc7f644 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/ConjunctionOptions.scala @@ -0,0 +1,8 @@ +package cc.factorie.app.nlp.coref + +object ConjunctionOptions { + val NO_CONJUNCTIONS = 1 + val HASH_CONJUNCTIONS = 2 + val SLOW_CONJUNCTIONS = 3 + val PRON_CONJUNCTIONS = 4 +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/CorefConllOutput.scala b/src/main/scala/cc/factorie/app/nlp/coref/CorefConllOutput.scala deleted file mode 100644 index 7b676ca..0000000 --- a/src/main/scala/cc/factorie/app/nlp/coref/CorefConllOutput.scala +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
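A tiny usage sketch for the phrase finder above; per its prereqAttrs it only requires that tokenization has run, and the printed field assumes Phrase exposes the usual TokenSpan string accessor.

    import cc.factorie.app.nlp.Document
    import cc.factorie.app.nlp.coref.AcronymNounPhraseFinder

    def findAcronyms(doc: Document): Unit =
      for (phrase <- AcronymNounPhraseFinder(doc))
        println("possible acronym mention: " + phrase.string)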
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.coref - -import cc.factorie.app.nlp.TokenSpan -import cc.factorie.util.F1Evaluation - -class CorefConllOutput { - val macroMUC = new F1Evaluation - val microB3 = new F1Evaluation - val microMUC = new F1Evaluation - val microCE = new F1Evaluation - val microCM = new F1Evaluation - - def textualOrder(ts1: TokenSpan, ts2: TokenSpan): Int = { - val (s1, e1) = (ts1.head.stringStart, ts1.last.stringEnd) - val (s2, e2) = (ts2.head.stringStart, ts2.last.stringEnd) - - if (s1 == s2) { - if (e1 == e2) 0 - else e1 - e2 - } else s1 - s2 - } - - def beforeInTextualOrder(m1: Mention, m2: Mention): Boolean = { - val o = textualOrder(m1.phrase, m2.phrase) - if (o == 0) textualOrder(m1.phrase, m2.phrase) < 0 - else o < 0 - } - - def printConll2011Format(coref: WithinDocCoref, out: java.io.PrintStream,withSingletons:Boolean = true) { - val entities = coref.entities.toSeq - val mappedMentions = if(!withSingletons) - coref.entities.filterNot(_.isSingleton).toSeq.flatMap(_.mentions).distinct.sortWith((s, t) => beforeInTextualOrder(s, t)) - else entities.flatMap(_.mentions).sortWith((s, t) => beforeInTextualOrder(s, t)) - val (singleTokMents, multiTokMents) = mappedMentions.partition(_.phrase.length == 1) - val beginningTokMap = multiTokMents.groupBy(_.phrase.head) - val endingTokMap = multiTokMents.groupBy(_.phrase.last) - val singleTokMap = singleTokMents.groupBy(_.phrase.head) - val fId = coref.document.name - val docName = fId.substring(0, fId.length() - 4) - val partNum = fId.takeRight(3) - - out.println("#begin document (" + docName + "); part " + partNum) - for (s <- coref.document.sentences) { - for (ti <- 0 until s.tokens.size) { - val beginningMents = beginningTokMap.get(s(ti)) - val endingMents = endingTokMap.get(s(ti)) - val singleTokMents = singleTokMap.get(s(ti)) - assert(singleTokMents.size <= 1) - out.print(docName + " " + partNum.toInt + " " + (ti + 1) + " " + s(ti).string + " " + s(ti).posTag.value + " - - - - - - - ") - var ments = List[String]() - if (beginningMents.isDefined) ments = beginningMents.get.reverse.map(m => "(" + entities.indexOf(m.entity)).mkString("|") :: ments - if (singleTokMents.isDefined) ments = singleTokMents.get.map(m => "(" + entities.indexOf(m.entity)+ ")").mkString("|") :: ments - if (endingMents.isDefined) ments = endingMents.get.reverse.map(m => entities.indexOf(m.entity) + ")").mkString("|") :: ments - if (ments.size > 0) out.println(ments.mkString("|")) - else out.println("-") - } - out.println() - } - out.println("#end document") - } - - def printInhouseScore(name: String = "Test") { - print("--- MICRO ---\n") - print(name+" micro "+microB3.toString("B3") + "\n") - print(name+" micro "+microMUC.toString("MUC") + "\n") - print(name+" micro "+microCE.toString("C-E") + "\n") - print(name+" micro "+microCM.toString("C-M") + "\n") - println("Average: "+(microB3.f1+microMUC.f1+microCE.f1)/3.0) - } -} - diff --git a/src/main/scala/cc/factorie/app/nlp/coref/CorefFeatures.scala b/src/main/scala/cc/factorie/app/nlp/coref/CorefFeatures.scala index eba4940..6f02f72 100644 --- a/src/main/scala/cc/factorie/app/nlp/coref/CorefFeatures.scala +++ 
b/src/main/scala/cc/factorie/app/nlp/coref/CorefFeatures.scala @@ -1,88 +1,11 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp.coref -import cc.factorie.app.nlp.lexicon.{StopWords, StaticLexicons} +import cc.factorie.app.nlp.lexicon.StaticLexicons import cc.factorie.app.nlp.ner.OntonotesEntityTypeDomain -import cc.factorie.app.nlp.phrase.{Gender, Number, _} -import cc.factorie.app.nlp.wordnet.WordNet +import cc.factorie.app.nlp.phrase.{Gender, GenderDomain} import scala.collection.mutable -/** Various lazily-evaluated cached characteristics of a Mention, typically attached to a Mention as an attr. */ -class MentionCharacteristics(val mention: Mention, lexicon:StaticLexicons) { - // TODO These should be cleaned up and made more efficient -akm - lazy val isPRO = CorefFeatures.posTagsSet.contains(mention.phrase.headToken.posTag.categoryValue) - lazy val isProper = CorefFeatures.properSet.contains(mention.phrase.headToken.posTag.categoryValue) - lazy val isNoun = CorefFeatures.nounSet.contains(mention.phrase.headToken.posTag.categoryValue) - lazy val isPossessive = CorefFeatures.posSet.contains(mention.phrase.headToken.posTag.categoryValue) - - lazy val hasSpeakWord = mention.phrase.exists(s => lexicon.iesl.Say.contains(s.string)) - lazy val hasSpeakWordContext = prev.exists(w => lexicon.iesl.Say.containsWord(w)) || follow.exists(w => lexicon.iesl.Say.containsWord(w)) - lazy val wnLemma = WordNet.lemma(mention.phrase.headToken.string, "n") - lazy val wnSynsets = WordNet.synsets(wnLemma).toSet - lazy val wnHypernyms = WordNet.hypernyms(wnLemma) - lazy val wnAntonyms = wnSynsets.flatMap(_.antonyms()).toSet - lazy val nounWords: Set[String] = - mention.phrase.tokens.filter(_.posTag.categoryValue.startsWith("N")).map(t => t.string.toLowerCase).toSet - lazy val lowerCaseHead: String = mention.phrase.headToken.string.toLowerCase - lazy val lowerCaseString:String = mention.phrase.string.toLowerCase - lazy val headPhraseTrim: String = mention.phrase.tokensString(" ").trim - lazy val nonDeterminerWords: Seq[String] = - mention.phrase.tokens.filterNot(_.posTag.categoryValue == "DT").map(t => t.string.toLowerCase) - lazy val initials: String = - mention.phrase.tokens.map(_.string).filterNot(lexicon.iesl.OrgSuffix.contains).filter(t => t(0).isUpper).map(_(0)).mkString("") - lazy val predictEntityType: Int = mention.phrase.attr[OntonotesPhraseEntityType].intValue - lazy val demonym: String = lexicon.iesl.DemonymMap.getOrElse(headPhraseTrim, "") - - lazy val capitalization: Char = { - if (mention.phrase.length == 1 && mention.phrase.head.positionInSentence == 0) 'u' // mention is the first word in sentence - else { - val s = mention.phrase.value.filter(_.posTag.categoryValue.startsWith("N")).map(_.string.trim) // TODO Fix this slow String operation - if (s.forall(_.forall(_.isUpper))) 'a' - else if 
(s.forall(t => t.head.isLetter && t.head.isUpper)) 't' - else 'f' - } - } - lazy val gender = mention.phrase.attr[Gender].categoryValue - lazy val number = mention.phrase.attr[Number].categoryValue - lazy val nounPhraseType = mention.phrase.attr[NounPhraseType].categoryValue - lazy val genderIndex = mention.phrase.attr[Gender].intValue - lazy val numberIndex = mention.phrase.attr[Number].intValue - lazy val nounPhraseTypeIndex = mention.phrase.attr[NounPhraseType].intValue - lazy val headPos = mention.phrase.headToken.posTag.categoryValue - lazy val inParens = mention.phrase.sentence.tokens.exists(t => t.posTag.categoryValue == "LRB" && t.positionInSection < mention.phrase.start) - lazy val prev = Vector(TokenFreqs.getTokenStringAtOffset(mention.phrase(0),-1), TokenFreqs.getTokenStringAtOffset(mention.phrase(0),-2)) - lazy val follow = Vector(TokenFreqs.getTokenStringAtOffset(mention.phrase.last,1), TokenFreqs.getTokenStringAtOffset(mention.phrase.last,2)) - - lazy val acronym: Set[String] = { - if (mention.phrase.length == 1) - Set.empty - else { - val alt1 = mention.phrase.value.map(_.string.trim).filter(_.exists(_.isLetter)) // tokens that have at least one letter character - val alt2 = alt1.filterNot(t => StopWords.contains(t.toLowerCase)) // alt1 tokens excluding stop words - val alt3 = alt1.filter(_.head.isUpper) // alt1 tokens that are capitalized - val alt4 = alt2.filter(_.head.isUpper) - Seq(alt1, alt2, alt3, alt4).map(_.map(_.head).mkString.toLowerCase).toSet - } - } - - lazy val canonicalizedPronounOrType = - if (isPRO) PronounSets.canonicalForms.getOrElse(lowerCaseString,lowerCaseHead) - else nounPhraseType -} - // TODO I think this should be renamed, but I'm not sure to what. -akm object CorefFeatures { val posTagsSet = Set("PRP", "PRP$", "WP", "WP$") @@ -220,7 +143,7 @@ object CorefFeatures { def numbersMatch(m1:Mention, m2:Mention): Ternary = { val n1 = m2.phrase.attr[Number].intValue val n2 = m1.phrase.attr[Number].intValue - import NumberDomain._ + import cc.factorie.app.nlp.phrase.NumberDomain._ if (n1 == n2 && n1 != UNKNOWN) True else if (n1 != n2 && n1 != UNKNOWN && n2 != UNKNOWN) False else if (n1 == UNKNOWN || n2 == UNKNOWN) { @@ -279,78 +202,3 @@ object CorefFeatures { ret } } - -object PronounSets { - val firstPerson = Set("i", "me", "myself", "mine", "my", "we", "us", "ourself", "ourselves", "ours", "our") - val secondPerson = Set("you", "yourself", "yours", "your", "yourselves") - val thirdPerson = Set("he", "him", "himself", "his", "she", "herself", "hers", "her", "it", "itself", "its", "one", "oneself", "one's", "they", "them", "themself", "themselves", "theirs", "their", "'em") - val other = Set("who", "whom", "whose", "where", "when", "which") - - val demonstrative = Set("this", "that", "these", "those") - - val singular = Set("i", "me", "myself", "mine", "my", "yourself", "he", "him", "himself", "his", "she", "her", "herself", "hers", "her", "it", "itself", "its", "one", "oneself", "one's") - val plural = Set("we", "us", "ourself", "ourselves", "ours", "our", "yourself", "yourselves", "they", "them", "themself", "themselves", "theirs", "their") - val male = Set("he", "him", "himself", "his") - val female = Set("her", "hers", "herself", "she") - - val reflexive = Set("herself", "himself", "itself", "themselves", "yourselves", "oneself", "yourself", "themself", "myself") - - val neuter = Set("it", "its", "itself", "this", "that", "anything", "something", "everything", "nothing", "which", "what", "whatever", "whichever") - val personal = Set("you", "your", 
"yours", "i", "me", "my", "mine", "we", "our", "ours", "us", "myself", "ourselves", "themselves", "themself", "ourself", "oneself", "who", "whom", "whose", "whoever", "whomever", "anyone", "anybody", "someone", "somebody", "everyone", "everybody", "nobody") - - val allPronouns = firstPerson ++ secondPerson ++ thirdPerson ++ other - val allPersonPronouns = allPronouns -- neuter - val canonicalForms = new mutable.HashMap[String,String](){ - ("i", "i") - ("i", "i") - ("me", "i") - ("my", "i") - ("myself", "i") - ("mine", "i") - ("you", "you") - ("your", "you") - ("yourself", "you") - ("yourselves", "you") - ("yours", "you") - ("he", "he") - ("him", "he") - ("his", "he") - ("himself", "he") - ("she", "she") - ("her", "she") - ("herself", "she") - ("hers", "she") - ("we", "we") - ("us", "we") - ("our", "we") - ("ourself", "we") - ("ourselves", "we") - ("ours", "we") - ("they", "they") - ("them", "they") - ("their", "they") - ("themself", "they") - ("themselves", "they") - ("theirs", "they") - ("'em", "they") - ("it", "it") - ("itself", "it") - ("its", "it") - ("one", "one") - ("oneself", "one") - ("one's", "one") - ("this", "this") - ("that", "that") - ("these", "these") - ("those", "those") - ("which", "which") - ("who", "who") - ("whom", "who") - ("thy", "thy") - ("y'all", "you") - ("you're", "you") - ("you'll", "you") - ("'s", "'s") - } -} - diff --git a/src/main/scala/cc/factorie/app/nlp/coref/CorefModel.scala b/src/main/scala/cc/factorie/app/nlp/coref/CorefModel.scala new file mode 100644 index 0000000..13ca675 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/CorefModel.scala @@ -0,0 +1,47 @@ +package cc.factorie.app.nlp.coref + +import java.io.{BufferedInputStream, DataInputStream, DataOutputStream, FileInputStream} + +import cc.factorie.model.Parameters +import cc.factorie.util.BinarySerializer +import cc.factorie.variable.{CategoricalDomain, CategoricalVectorDomain, DiscreteDomain, VectorDomain} + +trait CorefModel extends Parameters { + val MentionPairFeaturesDomain = new CategoricalVectorDomain[String] { + dimensionDomain.maxSize = 1e6.toInt + dimensionDomain.growPastMaxSize = false + } + val MentionPairCrossFeaturesDomain = new VectorDomain { + def dimensionDomain: DiscreteDomain = new DiscreteDomain(5e6.toInt + 1) + } + + val MentionPairLabelDomain = new CategoricalDomain[String] { this += "YES"; this += "NO"; freeze() } + + object CorefTokenFrequencies{ + var counter:TopTokenFrequencies = null + } + + def deserialize(stream: DataInputStream) { + val headWords = new DefaultHashMap[String,Int](0) + BinarySerializer.deserialize(headWords, stream) + BinarySerializer.deserialize(MentionPairFeaturesDomain, stream) + BinarySerializer.deserialize(new CategoricalVectorDomain[String] { val domain = new CategoricalDomain[String]} , stream) + BinarySerializer.deserialize(this, stream) + CorefTokenFrequencies.counter = new TopTokenFrequencies(headWords) + stream.close() + MentionPairFeaturesDomain.freeze() + } + + def deserialize(filename: String) { + deserialize(new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))) + } + + def serialize(stream: DataOutputStream) { + BinarySerializer.serialize(CorefTokenFrequencies.counter.headWords,stream) + MentionPairFeaturesDomain.freeze() + BinarySerializer.serialize(MentionPairFeaturesDomain , stream) + BinarySerializer.serialize(new CategoricalVectorDomain[String] { val domain = new CategoricalDomain[String]}, stream) + BinarySerializer.serialize(this,stream) + } + +} diff --git 
a/src/main/scala/cc/factorie/app/nlp/coref/CorefOptions.scala b/src/main/scala/cc/factorie/app/nlp/coref/CorefOptions.scala index 51c5225..5d22c8c 100644 --- a/src/main/scala/cc/factorie/app/nlp/coref/CorefOptions.scala +++ b/src/main/scala/cc/factorie/app/nlp/coref/CorefOptions.scala @@ -99,12 +99,5 @@ class CorefOptions { learningRate = opts.learningRate.value conjunctionStyle = conjunctionStyle } - } -object ConjunctionOptions { - val NO_CONJUNCTIONS = 1 - val HASH_CONJUNCTIONS = 2 - val SLOW_CONJUNCTIONS = 3 - val PRON_CONJUNCTIONS = 4 -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/CorefSystem.scala b/src/main/scala/cc/factorie/app/nlp/coref/CorefSystem.scala new file mode 100644 index 0000000..86a8f06 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/CorefSystem.scala @@ -0,0 +1,135 @@ +package cc.factorie.app.nlp.coref + +import java.util.concurrent.ExecutorService + +import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} +import cc.factorie.app.nlp.phrase._ +import cc.factorie.app.nlp.pos.PennPosTag +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} +import cc.factorie.optimize._ +import cc.factorie.util.Trackable + + +/**Base class for any coreference system + * + * @tparam CoreferenceStructure The type used as a training instance, ex. MentionPairLabel or MentionGraph, + * In the examples above, the training instance is either one pair or the whole document respectively*/ +abstract class CorefSystem[CoreferenceStructure] extends DocumentAnnotator with Trackable{ + val model:CorefModel + val options:CorefOptions + def prereqAttrs: Seq[Class[_]] = Seq(classOf[Token],classOf[PennPosTag]) + def postAttrs = Seq(classOf[WithinDocCoref]) + def tokenAnnotationString(token:Token): String = { + val entities = token.document.coref.entities.toSeq + var outputString = token.document.coref.mentions.filter(mention => mention.phrase.contains(token)) match { + case ms:Seq[Mention] if ms.length > 0 => ms.filter(m => m.entity != null && !m.entity.isSingleton).map{ + m => if (m.phrase.length == 1) "("+entities.indexOf(m.entity)+")" + else if(m.phrase.indexOf(token) == 0) "("+entities.indexOf(m.entity) + else if(m.phrase.indexOf(token) == m.phrase.length - 1) entities.indexOf(m.entity)+")" + else "" + }.mkString("|") + case _ => "_" + } + if(outputString == "") outputString = "_" + else if(outputString.endsWith("|")) outputString = outputString.substring(0,outputString.length-1) + "%15s".format(outputString) + } + + def process(document: Document) = { + document.annotators += classOf[WithinDocCoref] -> this.getClass + if(document.getCoref.mentions.isEmpty) + annotateMentions(document) + infer(document.getCoref) + document + } + + def annotateMentions(document: Document): Unit = { + if(options.useGoldBoundaries){ + assert(document.targetCoref ne null,"Gold Boundaries cannot be used without gold data.") + document.targetCoref.mentions.foreach{m => + if(options.useEntityType){ + val newMention = document.getCoref.addMention(new Phrase(m.phrase.value.chain,m.phrase.start,m.phrase.length,m.phrase.headTokenOffset)) + newMention.phrase.attr += m.phrase.attr[OntonotesPhraseEntityType] + newMention.phrase.attr += m.phrase.attr[NounPhraseType] + } + else { + val newMention = document.getCoref.addMention(new Phrase(m.phrase.value.chain,m.phrase.start,m.phrase.length,m.phrase.headTokenOffset)) + NounPhraseEntityTypeLabeler.process(newMention.phrase) + newMention.phrase.attr += m.phrase.attr[NounPhraseType] + } + } + 
NounPhraseGenderLabeler.process(document) + MentionPhraseNumberLabeler.process(document) + } + } + + /**Perform any preprocessing such as getting top used words + * @param trainDocs Documents to generate counts from */ + def preprocessCorpus(trainDocs: Seq[Document]): Unit + + /**Returns training labels for data in the format that should be used for training + * @param coref Gold Coref to be used for training */ + def getCorefStructure(coref: WithinDocCoref): CoreferenceStructure + def instantiateModel(optimizer: GradientOptimizer,pool: ExecutorService): ParallelTrainer + def infer(doc: WithinDocCoref): WithinDocCoref + + abstract class ParallelTrainer(optimizer: GradientOptimizer, val pool: ExecutorService) { + def map(in: CoreferenceStructure): Seq[Example] + def reduce(states: Iterable[Seq[Example]]) { + for (examples <- states) { + val trainer = new OnlineTrainer(model.parameters, optimizer, maxIterations = 1, logEveryN = examples.length - 1) + trainer.trainFromExamples(examples) + } + } + def runParallel(ins: Seq[CoreferenceStructure]){ + reduce(cc.factorie.util.Threading.parMap(ins, pool)(map)) + } + def runSequential(ins: Seq[CoreferenceStructure]){ + reduce(ins.map(map)) + } + } + + + // todo fix this + @deprecated("This exists to preserve prior behavior, it should be a constructor argument", "10/5/15") + val lexicon = new StaticLexicons()(LexiconsProvider.classpath()) + + // No training in this library +/* + def train(trainDocs: Seq[Document], testDocs: Seq[Document], wn: WordNet, rng: scala.util.Random, saveModelBetweenEpochs: Boolean,saveFrequency: Int,filename: String, learningRate: Double = 1.0): Double = { + val optimizer = if (options.useAverageIterate) new AdaGrad(learningRate) with ParameterAveraging else if (options.useAdaGradRDA) new AdaGradRDA(rate = learningRate,l1 = options.l1) else new AdaGrad(rate = learningRate) + for(doc <- trainDocs; mention <- doc.targetCoref.mentions) mention.attr += new MentionCharacteristics(mention, lexicon) + preprocessCorpus(trainDocs) + |**("Training Structure Generated") + var i = 0 + val trainingFormat: Seq[CoreferenceStructure] = trainDocs.map{doc => i +=1 ; if(i % 100 == 0) println("Processing Labels for: " + i + " of " + trainDocs.size); getCorefStructure(doc.targetCoref)} + **| + val pool = java.util.concurrent.Executors.newFixedThreadPool(options.numThreads) + var accuracy = 0.0 + try { + val trainer = instantiateModel(optimizer, pool) + for (iter <- 0 until options.numTrainingIterations) { + val shuffledDocs = rng.shuffle(trainingFormat) + val batches = shuffledDocs.grouped(options.featureComputationsPerThread*options.numThreads).toSeq + for ((batch, b) <- batches.zipWithIndex) { + if (options.numThreads > 1) trainer.runParallel(batch) + else trainer.runSequential(batch) + } + if (!model.MentionPairFeaturesDomain.dimensionDomain.frozen) model.MentionPairFeaturesDomain.dimensionDomain.freeze() + if (!options.useAdaGradRDA && options.useAverageIterate) optimizer match {case o: ParameterAveraging => o.setWeightsToAverage(model.parameters) } + println("Train docs") + doTest(trainDocs.take((trainDocs.length*options.trainPortionForTest).toInt), wn, "Train") + println("Test docs") + |**("Running Test") + accuracy = doTest(testDocs, wn, "Test") + **|("End Test") + if(saveModelBetweenEpochs && iter % saveFrequency == 0) + serialize(filename + "-" + iter) + if (!options.useAdaGradRDA && options.useAverageIterate) optimizer match {case o: ParameterAveraging => o.unSetWeightsToAverage(model.parameters) } + } + if (!options.useAdaGradRDA&& 
options.useAverageIterate) optimizer match {case o: ParameterAveraging => o.setWeightsToAverage(model.parameters) } + accuracy + } finally { + pool.shutdown() + } */ + } \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/CorefTrainer.scala b/src/main/scala/cc/factorie/app/nlp/coref/CorefTrainer.scala deleted file mode 100644 index 03c1b24..0000000 --- a/src/main/scala/cc/factorie/app/nlp/coref/CorefTrainer.scala +++ /dev/null @@ -1,347 +0,0 @@ - -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -package cc.factorie.app.nlp.coref - -import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} -import cc.factorie.app.nlp.load.LoadConll2011 -import cc.factorie.app.nlp.ner.{ConllChainNer, NerTag} -import cc.factorie.app.nlp.phrase._ -import cc.factorie.app.nlp.wordnet.WordNet -import cc.factorie.app.nlp.{Document, DocumentAnnotatorPipeline, MutableDocumentAnnotatorMap} -import cc.factorie.util.{HyperparameterMain, TimingCollector, Trackable, Trackers} - -/** Trainers for Coreference Systems*/ -trait ForwardCorefTrainerOpts extends CorefTrainerOpts{ - val numPositivePairsTrain = new CmdOption("prune-train", 2, "INT", "number of positive pairs before pruning instances in training") - val numPositivePairsTest = new CmdOption("prune-test", 100, "INT", "number of positive pairs before pruning instances in testing") - val numThreads = new CmdOption("num-threads", 4, "INT", "Number of threads to use") - val featureComputationsPerThread = new CmdOption("feature-computations-per-thread", 2, "INT", "Number of feature computations per thread to run in parallel during training") - val numTrainingIterations = new CmdOption("num-training-iterations", 4, "INT", "Number of passes through the training data") - val useMIRA = new CmdOption("use-mira", false, "BOOLEAN", "Whether to use MIRA as an optimizer") - val saveFrequency = new CmdOption("save-frequency", 1, "INT", "how often to save the model between epochs") - val trainPortionForTest = new CmdOption("train-portion-for-test", 0.1, "DOUBLE", "When testing on train, what portion to use.") - val mergeFeaturesAtAll = new CmdOption("merge-features-at-all", true, "BOOLEAN", "Whether to merge features") - val conjunctionStyle = new CmdOption("conjunction-style", "NONE", "NONE|HASH|SLOW", "What types of conjunction features to use - options are NONE, HASH, and SLOW (use slow string-based conjunctions).") - val entityLR = new CmdOption("entity-left-right",false,"BOOLEAN","whether to do entity-based pruning in lr search") - val slackRescale = new CmdOption("slack-rescale",2.0,"FLOAT","recall bias for hinge loss") - val useEntityType = new CmdOption("use-entity-type",true,"BOOLEAN","whether to use entity type info") - val mergeAppositions = new CmdOption("merge-appositions",false,"BOOLEAN","whether to merge appositions as a rule") - val usePronounRules 
= new CmdOption("use-pronoun-rules",false,"BOOLEAN","whether to do deterministic assigning of pronouns and not consider pronouns for training") - val trainSeparatePronounWeights = new CmdOption("separate-pronoun-weights",true,"BOOLEAN","train a separate weight vector for pronoun-pronoun comparison") - val numCompareToTheLeft = new CmdOption("num-compare-to-the-left",75,"INT","number of mentions to compare to the left before backing off to only looking at non-pronouns and those in entities (only used if entityLR == true)") - val learningRate = new CmdOption("learning-rate",1.0,"FLOAT","learning rate") - val serialize = new CmdOption("serialize", "ForwardCoref.factorie", "FILE", "Filename in which to serialize classifier.") - val deserialize = new CmdOption("deserialize", "", "FILE", "Filename from which to deserialize classifier.") - val useAverageIterate = new CmdOption("use-average-iterate", true, "BOOLEAN", "Use the average iterate instead of the last iterate?") -} -/* -object ForwardCorefTrainer extends CorefTrainer{ - object opts extends ForwardCorefTrainerOpts - - // todo fix this - @deprecated("This exists to preserve prior behavior, it should be a constructor argument", "10/5/15") - val lexicon = new StaticLexicons()(LexiconsProvider.classpath()) - - def evaluateParameters(args: Array[String]): Double = { - opts.parse(args) - val conjunctionStyle = opts.conjunctionStyle.value match { - case "NONE" => ConjunctionOptions.NO_CONJUNCTIONS - case "HASH" => ConjunctionOptions.HASH_CONJUNCTIONS - case "SLOW" => ConjunctionOptions.SLOW_CONJUNCTIONS - case s => sys.error("Unknown conjunction style: " + s) - } - - val lr = if (conjunctionStyle == ConjunctionOptions.HASH_CONJUNCTIONS) new ForwardCorefImplicitConjunctions else new ForwardCoref - - val options = lr.options - //options that get serialized with the model - options.setParameterOptions(opts) - options.setConfig("useEntityType",opts.useEntityType.value) - options.setConfig("trainSeparatePronounWeights",opts.trainSeparatePronounWeights.value) - // options which affect only learning - - println("** Arguments") - val ignoreOpts = Set("config", "help", "version") - for (o <- opts.values.toSeq.sortBy(_.name); if !ignoreOpts(o.name)) println(o.name + " = " + o.value) - - val timer = new TimingCollector() - Trackers += timer - - val rng = new scala.util.Random(opts.randomSeed.value) - val loadTrain = !opts.deserialize.wasInvoked - val (trainDocs,testDocs) = if(opts.useGoldBoundaries.value ) makeTrainTestDataGoldBoundaries(opts.trainFile.value,opts.testFile.value,loadTrain) - else makeTrainTestData(opts.trainFile.value,opts.testFile.value,options, loadTrain, opts.useNerMentions.value) - - addGenderNumberLabeling(trainDocs,testDocs) - println(timer.timings) - - - if (opts.deserialize.wasInvoked){ - - val lr = if(opts.deserialize.value == "NerForwardCoref.factorie") new NerForwardCoref() - else if (opts.deserialize.value == "ParseForwardCoref.factorie") new ParseForwardCoref() - else new ForwardCoref() - - //copy over options that are tweakable at test time - println("deserializing from " + opts.deserialize.value) - - lr.deserialize(opts.deserialize.value) //note that this may overwrite some of the options specified at the command line. The idea is that there are certain options that must be consistent - //between train and test. These options were serialized with the model, and set when you deserialize the model. - - //However, there are some options that are safe to change at test time. 
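
As a rough sketch of how a deserialized system like this is then used (illustrative only, not part of the patch): a concrete CorefSystem is a DocumentAnnotator, so calling process populates the document's coref structure, which can then be inspected. Here corefSystem and document are placeholder names for a trained system and a tokenized, POS-tagged Document.

    // Apply a trained coreference system, then list the resulting entity clusters.
    val annotated = corefSystem.process(document)
    for (entity <- annotated.coref.entities; mention <- entity.mentions)
      println(mention.phrase.string + " -> " + entity.uniqueId)
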
Just to be extra sure, we set this manually back - lr.options.setConfig("usePronounRules",options.usePronounRules) //this is safe to tweak at test time if you train separate weights for all the pronoun cases - - lr.model.MentionPairFeaturesDomain.freeze() - //Add Cached Mention Features - for(doc <- testDocs; mention <- doc.coref.mentions) mention.attr += new MentionCharacteristics(mention, lexicon) - - lr.doTest(testDocs, WordNet, "Test") - } - else{ - lr.train(trainDocs, testDocs, WordNet, rng, opts.saveFrequency.wasInvoked,opts.saveFrequency.value,opts.serialize.value, opts.learningRate.value) - println(timer.timings) - if (opts.serialize.wasInvoked) - lr.serialize(opts.serialize.value) - } - - - if (opts.writeConllFormat.value) - writeConllOutput(testDocs) - - val accuracy = 0.0 - if(opts.targetAccuracy.wasInvoked) cc.factorie.assertMinimalAccuracy(accuracy,opts.targetAccuracy.value.toDouble) - - accuracy - } -} - -object StructuredCorefTrainer extends CorefTrainer{ - object ProbCorefTrainerOpts extends CorefTrainerOpts{ - val maxMentDistance = new CmdOption("prune-train", 2, "INT", "number of positive pairs before pruning instances in training") - val numPositivePairsTest = new CmdOption("prune-test", 100, "INT", "number of positive pairs before pruning instances in testing") - val numThreads = new CmdOption("num-threads", 4, "INT", "Number of threads to use") - val numTrainingIterations = new CmdOption("num-training-iterations", 20, "INT", "Number of iterations to use for training") - val saveFrequency = new CmdOption("save-frequency", 4, "INT", "how often to save the model between epochs") - val learningRate = new CmdOption("learning-rate",1.0,"FLOAT","learning rate") - val featureSet = new CmdOption("feature-set","lexical","LEXICAL|CONVENTIONAL","Feature set to use for this model") - val l1 = new CmdOption("l1", .0001, "INT", "l1 regularizer for adaGradRDA") - val useAverageIterate = new CmdOption("use-average-iterate", true, "BOOLEAN", "Use the average iterate instead of the last iterate?") - val serialize = new CmdOption("serialize", "StructuredCoref.factorie","FILE","Filename in which to serialize classifier.") - val deserialize = new CmdOption("deserialize", "", "FILE", "Filename from which to deserialize classifier.") - } - val opts = ProbCorefTrainerOpts - def evaluateParameters(args:Array[String]):Double = { - opts.parse(args) - //two varibles taken from ForwardCoreferenceTrainer - val rng = new scala.util.Random(opts.randomSeed.value) - val loadTrain = !opts.deserialize.wasInvoked - val coreferenceSystem = new StructuredCoref - val options = coreferenceSystem.options - options.featureSet="lexical" - options.learningRate = opts.learningRate.value - options.l1 = opts.l1.value - options.useAverageIterate = opts.useAverageIterate.value - options.useAdaGradRDA = false - options.numTrainingIterations = opts.numTrainingIterations.value - options.useGoldBoundaries = opts.useGoldBoundaries.value - options.useNERMentions = opts.useNerMentions.value - println("** Arguments") - val ignoreOpts = Set("config", "help", "version") - for (o <- opts.values.toSeq.sortBy(_.name); if !ignoreOpts(o.name)) println(o.name + " = " + o.value) - - println("Loading Documents") - val (trainDocs,testDocs) = if(options.useGoldBoundaries) makeTrainTestDataGoldBoundaries(opts.trainFile.value,opts.testFile.value,loadTrain) - else makeTrainTestData(opts.trainFile.value, opts.testFile.value,options, loadTrain, opts.useNerMentions.value) - - addGenderNumberLabeling(trainDocs,testDocs) - - var accuracy = 0.0 - 
if (opts.deserialize.wasInvoked){ - //copy over options that are tweakable at test time - println("deserializing from " + opts.deserialize.value) - val testSystem = if(opts.deserialize.value == "NerStructuredCoref.factorie") new NerStructuredCoref() - else if (opts.deserialize.value == "ParseStructuredCoref.factorie") new ParseStructuredCoref() - else new StructuredCoref() - testSystem.deserialize(opts.deserialize.value) //note that this may overwrite some of the options specified at the command line. The idea is that there are certain options that must be consistent - //between train and test. These options were serialized with the model, and set when you deserialize the model. - options.featureSet = "lexical" - testSystem.model.MentionPairFeaturesDomain.freeze() - accuracy = testSystem.doTest( testDocs, WordNet,"Test") - testSystem - }else{ - accuracy = coreferenceSystem.train(trainDocs,testDocs, WordNet, rng,opts.saveFrequency.wasInvoked,opts.saveFrequency.value,opts.serialize.value, opts.learningRate.value) - if (opts.serialize.wasInvoked && !opts.deserialize.wasInvoked) - coreferenceSystem.serialize(opts.serialize.value) - } - - if(opts.writeConllFormat.value) - writeConllOutput(testDocs) - testDocs.head.tokens.foreach{t => println(coreferenceSystem.tokenAnnotationString(t))} - accuracy - } -} -*/ -trait CorefTrainerOpts extends cc.factorie.util.DefaultCmdOptions with cc.factorie.app.nlp.SharedNLPCmdOptions{ - val trainFile = new CmdOption("train", "src/main/resources/conll-train-clean.txt", "STRING", "File with training data") - val testFile = new CmdOption("test", "src/main/resources/conll-test-clean.txt", "STRING", "File with testing data") - val useExactEntTypeMatch = new CmdOption("use-exact-entity-type-match", true, "BOOLEAN", "whether to require exact alignment between NER annotation and NP annotation") - val useGoldBoundaries = new CmdOption("use-gold-boundaries",false,"BOOLEAN","whether to use gold parse boundaries + gold mention boundaries") - val mentionAlignmentShiftWidth = new CmdOption("alignment-width",0,"INT","tolerance on boundaries when aligning detected mentions to gt mentions") - val portion = new CmdOption("portion", 1.0, "DOUBLE", "Portion of corpus to load.") - val useNerMentions = new CmdOption("use-ner-mentions", false, "BOOLEAN", "Whether to use NER mentions instead of noun phrase mentions") - val randomSeed = new CmdOption("random-seed", 0, "INT", "Seed for the random number generator") - val writeConllFormat = new CmdOption("write-conll-format", true, "BOOLEAN", "Write CoNLL format data.") -} -/** Classes shared by both coref systems*/ -/* -abstract class CorefTrainer extends HyperparameterMain with Trackable{ - def evaluateParameters(args: Array[String]): Double - - def opts:CorefTrainerOpts - - def addGenderNumberLabeling(trainDocs:Seq[Document], testDocs:Seq[Document]){ - |**("Adding Training Gender Labels") - if(trainDocs ne null){ - for (doc <- trainDocs; mention <- doc.targetCoref.mentions) { - NounPhraseGenderLabeler.process(mention.phrase) - NounPhraseNumberLabeler.process(mention.phrase) - DeterministicNounPhraseTypeLabeler.process(mention.phrase) - } - - for (doc <- trainDocs; mention <- doc.coref.mentions) { - NounPhraseGenderLabeler.process(mention.phrase) - NounPhraseNumberLabeler.process(mention.phrase) - DeterministicNounPhraseTypeLabeler.process(mention.phrase) - } - } - for (doc <- testDocs; mention <- doc.coref.mentions) { - NounPhraseGenderLabeler.process(mention.phrase) - NounPhraseNumberLabeler.process(mention.phrase) - 
DeterministicNounPhraseTypeLabeler.process(mention.phrase) - } - **| - } - - def makeTrainTestDataGoldBoundaries(trainFile: String, testFile: String, loadTrain: Boolean): (Seq[Document],Seq[Document]) = { - var trainDocs: Seq[Document] = null - if (loadTrain){ - val allTrainDocs = LoadConll2011.loadWithParse(trainFile) - trainDocs = allTrainDocs.take((allTrainDocs.length*opts.portion.value).toInt) - for(doc <- trainDocs; mention <- doc.getTargetCoref.mentions){ - assert(mention.phrase.attr[OntonotesPhraseEntityType] ne null,"missing entity type") - doc.coref.addMention(mention.phrase).phrase.attr += mention.phrase.attr[OntonotesPhraseEntityType] - } - println("Train: "+trainDocs.length+" documents, " + trainDocs.map(d => d.coref.mentions.size).sum.toFloat / trainDocs.length + " mentions/doc") - } - val allTestDocs = LoadConll2011.loadWithParse(testFile) - val testDocs = allTestDocs.take((allTestDocs.length*opts.portion.value).toInt) - for(doc <- testDocs; mention <- doc.getTargetCoref.mentions){ - assert(mention.phrase.attr[OntonotesPhraseEntityType] ne null,"missing entity type") - doc.coref.addMention(mention.phrase).phrase.attr += mention.phrase.attr[OntonotesPhraseEntityType] - } - println("Test : "+ testDocs.length+" documents, " + testDocs.map(d => d.coref.mentions.size).sum.toFloat / testDocs.length + " mention/doc") - - (trainDocs,testDocs) - } - - def makeTrainTestData(trainFile: String, testFile: String, options: CorefOptions, loadTrain: Boolean, useNerMentions: Boolean): (Seq[Document],Seq[Document]) = { - val map = new MutableDocumentAnnotatorMap ++= DocumentAnnotatorPipeline.defaultDocumentAnnotationMap - if (useNerMentions) { - map(classOf[NerTag]) = () => ConllChainNer - } - var trainDocs: Seq[Document] = null - if(loadTrain){ - val allTrainDocs = LoadConll2011.loadWithParse(trainFile, loadSingletons = false) - val unalignedTrainDocs = allTrainDocs.take((allTrainDocs.length*opts.portion.value).toInt) - trainDocs = MentionAlignment.makeLabeledData(unalignedTrainDocs,null,options.useEntityType, options, map.toMap) - println("Train: "+trainDocs.length+" documents, " + trainDocs.map(d => d.targetCoref.mentions.size).sum.toFloat / trainDocs.length + " mentions/doc") - } - - val testDocs: Seq[ Document] = { - val allTestDocs = LoadConll2011.loadWithParse(testFile, loadSingletons = false) - val unalignedTestDocs = allTestDocs.take((allTestDocs.length*opts.portion.value).toInt) - MentionAlignment.makeLabeledData(unalignedTestDocs,null,options.useEntityType, options, map.toMap) - } - println("Test : "+ testDocs.length+" documents, " + testDocs.map(d => d.targetCoref.mentions.size).sum.toFloat / testDocs.length + " mention/doc") - if(!useNerMentions){ - val labeler = NounPhraseEntityTypeLabeler - if (loadTrain) for (doc <- trainDocs; mention <- doc.coref.mentions) labeler.process(mention.phrase) - for (doc <- testDocs; mention <- doc.coref.mentions) labeler.process(mention.phrase) - } - (trainDocs,testDocs) - } - - def writeConllOutput(testDocs:Seq[Document]){ - val conllFormatPrinter = new CorefConllOutput - val conllFormatFilt = new java.io.PrintStream(new java.io.File("eval-test.filtpred")) - testDocs.foreach(d => conllFormatPrinter.printConll2011Format(d.getCoref, conllFormatFilt,withSingletons = false)) - conllFormatFilt.flush() - conllFormatFilt.close() - - val conllFormatNonFilt = new java.io.PrintStream(new java.io.File("eval-test-key.filtgold")) - testDocs.foreach{d => d.targetCoref.removeSingletons(); conllFormatPrinter.printConll2011Format(d.getTargetCoref, 
conllFormatNonFilt,withSingletons = false)} - conllFormatNonFilt.flush() - conllFormatNonFilt.close() - } - -} -*/ -/* -object StructuredCorefOptimizer{ - def main(args: Array[String]) { - val opts = StructuredCorefTrainer.ProbCorefTrainerOpts - opts.parse(args) - opts.serialize.setValue("") - val l1 = cc.factorie.util.HyperParameter(opts.l1, new cc.factorie.util.SampleFromSeq(List(0.000005,0.00005,0.0005,.0001,.00001))) - val rate = cc.factorie.util.HyperParameter(opts.learningRate, new cc.factorie.util.SampleFromSeq(List(0.1,0.5,0.8,1,1.2))) - - val qs = new cc.factorie.util.QSubExecutor(40, "cc.factorie.app.nlp.coref.StructuredCorefTrainer") - val optimizer = new cc.factorie.util.HyperParameterSearcher(opts, Seq(l1, rate), qs.execute, 40, 18, 60) - val result = optimizer.optimize() - println("Got results: " + result.mkString(" ")) - println("Best l1: " + opts.l1.value + " best lr: " + opts.learningRate.value) - println("Running best configuration...") - opts.serialize.setValue("StructuredCoref.factorie") - import scala.concurrent.Await - import scala.concurrent.duration._ - Await.result(qs.execute(opts.values.flatMap(_.unParse).toArray), 6.hours) - println("Done") - } -} -*/ -/* -object ForwardCorefOptimizer{ - def main(args: Array[String]) { - val opts = ForwardCorefTrainer.opts - opts.parse(args) - opts.serialize.setValue("") - - val l1 = cc.factorie.util.HyperParameter(opts.numTrainingIterations, new cc.factorie.util.SampleFromSeq(List(1,2,3,4,5))) - val rate = cc.factorie.util.HyperParameter(opts.learningRate, new cc.factorie.util.SampleFromSeq(List(0.1,0.5,0.8,1,1.2,1.5))) - - val qs = new cc.factorie.util.QSubExecutor(40, "cc.factorie.app.nlp.coref.ForwardCorefTrainer") - val optimizer = new cc.factorie.util.HyperParameterSearcher(opts, Seq(l1, rate), qs.execute, 40, 22, 60) - val result = optimizer.optimize() - println("Got results: " + result.mkString(" ")) - println("Best rate: " + opts.learningRate.value + " best l1: " + opts.numTrainingIterations.value) - opts.serialize.setValue("ForwardCoref.factorie") - println("Running best configuration...") - import scala.concurrent.Await - import scala.concurrent.duration._ - Await.result(qs.execute(opts.values.flatMap(_.unParse).toArray), 6.hours) - println("Done") - } -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/CorefTrainerOpts.scala b/src/main/scala/cc/factorie/app/nlp/coref/CorefTrainerOpts.scala new file mode 100644 index 0000000..8b8f2a4 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/CorefTrainerOpts.scala @@ -0,0 +1,13 @@ +package cc.factorie.app.nlp.coref + +trait CorefTrainerOpts extends cc.factorie.util.DefaultCmdOptions { + val trainFile = new CmdOption("train", "src/main/resources/conll-train-clean.txt", "STRING", "File with training data") + val testFile = new CmdOption("test", "src/main/resources/conll-test-clean.txt", "STRING", "File with testing data") + val useExactEntTypeMatch = new CmdOption("use-exact-entity-type-match", true, "BOOLEAN", "whether to require exact alignment between NER annotation and NP annotation") + val useGoldBoundaries = new CmdOption("use-gold-boundaries",false,"BOOLEAN","whether to use gold parse boundaries + gold mention boundaries") + val mentionAlignmentShiftWidth = new CmdOption("alignment-width",0,"INT","tolerance on boundaries when aligning detected mentions to gt mentions") + val portion = new CmdOption("portion", 1.0, "DOUBLE", "Portion of corpus to load.") + val useNerMentions = new CmdOption("use-ner-mentions", false, 
"BOOLEAN", "Whether to use NER mentions instead of noun phrase mentions") + val randomSeed = new CmdOption("random-seed", 0, "INT", "Seed for the random number generator") + val writeConllFormat = new CmdOption("write-conll-format", true, "BOOLEAN", "Write CoNLL format data.") +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/DefaultHashMap.scala b/src/main/scala/cc/factorie/app/nlp/coref/DefaultHashMap.scala new file mode 100644 index 0000000..847ba96 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/DefaultHashMap.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.coref + +import scala.collection.mutable + +class DefaultHashMap[String,Int](val defaultValue: Int) extends mutable.HashMap[String,Int] { + override def default(key:String) = defaultValue +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/DeterministicCoref.scala b/src/main/scala/cc/factorie/app/nlp/coref/DeterministicCoref.scala deleted file mode 100644 index 41f8df5..0000000 --- a/src/main/scala/cc/factorie/app/nlp/coref/DeterministicCoref.scala +++ /dev/null @@ -1,2435 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.coref - -import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} -import java.util.concurrent.ExecutorService - -import cc.factorie.app.nlp.{Document, _} -import cc.factorie.app.nlp.ner._ -import cc.factorie.app.nlp.parse._ -import cc.factorie.app.nlp.phrase._ -import cc.factorie.app.nlp.pos._ -import cc.factorie.app.nlp.wordnet._ -import cc.factorie.util._ -import cc.factorie.variable.{CategoricalVariable, EnumDomain} - -import scala.collection.immutable.StringOps -import scala.collection.mutable.{HashMap, HashSet} - - - -/** - * This file contains an implementation of the Deterministic Within-Document Coreference System described - * in the papers: - * - * Heeyoung Lee, Yves Peirsman, Angel Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky. - * Stanford's Multi-Pass Sieve Coreference Resolution System at the CoNLL-2011 Shared Task. - * In Proceedings of the CoNLL-2011 Shared Task, 2011. - * - * Karthik Raghunathan, Heeyoung Lee, Sudarshan Rangarajan, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky, Christopher Manning - * A Multi-Pass Sieve for Coreference Resolution - * EMNLP-2010, Boston, USA. 2010. - * - * There are a few differences in this implementation and the system described in the above papers. - * Most significantly, there is no discourse processing in this implementation. Experiments were performed using the - * "gold" speaker annotations and the discourse processing sieve. These experiments revealed that the discourse processing - * only increased the F1 score of this system a small amount.Additionally, this system does not make use of all of the - * external resources (Freebase, etc) that the system described in the papers uses. 
- * - * Other differences include a more restrictive condition on personal pronoun agreement in the Pronoun Sieve and a loosening - * of the NER agreement constraint in the Relaxed Head Matching Sieve. - * - * The performance of this deterministic coreference system is not as good as Stanford's implementation. After spending - * quite a bit of time performing error analysis, it seems that the errors made by this system stem from parses, NER labels, - * and head word identifications that our system computes differently than Stanford's system. These differences are sometimes - * not mistakes, but cause deterministic decisions to be made incorrectly. Many of the errors seen were mistakes - * identifying the head word in a phrase, worsening the performance of the Strict Head Matching Sieve. - * - * The performance results on the Conll 2011 test set are shown below. The results presented below used mentions predicted - * by Stanford's implementation as input to this system. - * The below results for Stanford's system are from http://conll.cemantix.org/download/re-scoring-conll-evaluations.v16.xlsx - * - * Metric | System | Recall | Precision | F1 - * -------|----------|------------|-------------|---------- - * MUC | factorie | 60.54% | 56.11% | 58.24% - * MUC | Stanford | --- | --- | 59.57% - * B3 | factorie | 45.68% | 49.97% | 47.73% - * B3 | Stanford | --- | --- | 48.93% - * ceafm | factorie | 53.79% | 46.85% | 50.08% - * ceafm | Stanford | --- | --- | 53.04% - * ceafe | factorie | 53.54% | 39.21% | 46.27% - * ceafe | Stanford | --- | --- | 46.11% - * blanc | factorie | 34.66% | 51.65% | 41.49% - * blanc | Stanford | --- | --- | 48.84% - * - * - * Next steps would be to continue to implement the Discourse Processing Sieve as well as to continue to investigate - * where errors in the system stem from or perhaps make the system more robust to slight variations in things such as NER labellings, etc. - * - * Also, note that there is a debug infrastructure in place for this system, it is defined by the trait DeterministicCorefDebug - * It generates a folder of HTML files for each document processed by the system. There is an HTML file for each Sieve of the - * system, which contains a clear representation of all of the decisions made in that sieve as well as the mention cluster assignments - * after the sieve is completed. The debug infrastructure can be used by passing the name of an output directory as an argument - * into the sieves. - * - * @author Nicholas Monath - */ - - -/** - * Domain for labeling documents with "types" such as Article or Conversation, for the - * sake of a coreference system. - */ -object CorefDocumentTypeDomain extends EnumDomain { - val UNKNOWN, // uncertain - ARTICLE, // Article, newswire etc - CONVERSATION = Value // conversational Text - freeze() -} - -/** - * Variable for the CorefDocumentTypeDomain - * - */ -class CorefDocumentType extends CategoricalVariable[String] { - def this(value:String) = { this(); _initialize(domain.index(value)) } - def this(value:Int) = { this(); _initialize(value) } - def domain = CorefDocumentTypeDomain -} - - - -/** - * A collection of fields used in the deterministic coreference system. 
- * @param mention - */ -class DeterministicCorefCache(val mention: Mention) { - lazy val mentionSentence: Sentence = mention.phrase.sentence - lazy val relaxedMentionString: String = CorefUtil.relaxString(mention) - lazy val absoluteSentenceNumber: Int = CorefUtil.absoluteSentenceNumber(mention) - lazy val isIndefiniteArticleOrPronoun = (CorefUtil.indefiniteArticles.contains(mention.phrase.tokens(0).string.toLowerCase.trim) || CorefUtil.indefinitePronouns.contains(mention.phrase.tokens(0).string.toLowerCase.trim)) -} - - -/** - * A domain representing the animacy of tokens & mentions - */ -object DCorefAnimacyDomain extends EnumDomain { - val UNKNOWN, // uncertain - ANIMATE, // Animate Object - INANIMATE = Value // Inanimate object - freeze() -} - -/** - * A variable storing an Animacy value - */ -class DCorefAnimacy extends CategoricalVariable[String] { - def this(value:String) = { this(); _initialize(domain.index(value)) } - def this(value:Int) = { this(); _initialize(value) } - def domain = DCorefAnimacyDomain -} - -/** - * The extension of the animacy categorical variable to phrases - * @param phrase - * @param value - */ -class DCorefPhraseAnimacy(val phrase:Phrase, value:Int) extends DCorefAnimacy(value) { - def this(phrase:Phrase, value:String) = this(phrase, DCorefAnimacyDomain.index(value)) -} - -/** - * A domain representing the person (e.g. 1st, 2nd, 3rd) of a mention - */ -object DCorefPersonDomain extends EnumDomain { - val UNKNOWN, // uncertain - FIRST, // 1st person, I, me, us, etc ... - SECOND, // 2nd person, you, your, etc ... - THIRD = Value // 3rd person, he, she, it, the car, the dog, etc ... - freeze() -} - -/** - * A variable storing a Person value - */ -class DCorefPerson extends CategoricalVariable[String] { - def this(value:String) = { this(); _initialize(domain.index(value)) } - def this(value:Int) = { this(); _initialize(value) } - def domain = DCorefPersonDomain -} - -/** - * The extension of the person categorical variable to phrases - * @param phrase - * @param value - */ -class DCorefPhrasePerson(val phrase:Phrase, value:Int) extends DCorefPerson(value) { - def this(phrase:Phrase, value:String) = this(phrase, DCorefPersonDomain.index(value)) -} - - - -/** - * CorefUtil is a class which provides several methods which are used in the various - * deterministic coreference sieves. 
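
The DeterministicCorefCache class above follows the usual attr pattern: it is attached to a Mention once, and its lazily computed fields are read back wherever they are needed (as sentenceDistance does further below). A minimal sketch, not part of the patch, with mention standing in for any Mention.

    // Attach the per-mention cache, then reuse its lazily computed fields.
    mention.attr += new DeterministicCorefCache(mention)
    val sentenceIndex = mention.attr[DeterministicCorefCache].absoluteSentenceNumber
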
- */ - -object CorefUtil extends CorefUtil {} - - -class CorefUtil { - - - /* - * Collections of words - */ - - val locationModifiers: Set[String] = Set[String]("east", "west", "north", "south", "eastern", "western", "northern", "southern", "upper", "lower", "northeastern", "northwestern", "southeastern", "southwestern") - val indefiniteArticles: HashSet[String] = HashSet[String]("a", "an") - val indefinitePronouns: HashSet[String] = HashSet[String]("another", "anybody", "anyone", "anything", "each", "either", "enough", "everything", "less", "little", "much", "neither", "nobody", "no-one", "nothing", "one", "other", "somebody", "someone", "something", "both", "few", "fewer", "many", "others", "several", "all", "any", "more", "most", "none", "some", "such") - val stopWordsList: HashSet[String] = HashSet[String]("the", "that", "this", "mr.", "miss", "mrs.", "dr.", "ms.", "inc.", "ltd.", "corp.", "'s") - val firstPersonPronouns: HashSet[String] = HashSet[String]("i", "we", "me", "us", "'s", "my", "myself", "mine", "our", "ours", "ourself", "ourselves") - val secondPersonPronouns: HashSet[String] = HashSet[String]("you", "your", "yours","yourself", "yourselves") - val thirdPersonPersonalPronouns: HashSet[String] = HashSet[String]("he", "his", "him", "himself", "she", "her", "hers", "herself", "they", "their", "theirs", "them", "themself", "themselves","'em") - val personalPronouns: HashSet[String] = firstPersonPronouns ++ secondPersonPronouns ++ thirdPersonPersonalPronouns - val animatePronouns: HashSet[String] = personalPronouns ++ HashSet[String]("one", "oneself", "one's","who", "whom", "whose") - val reflexivePronouns: HashSet[String] = HashSet[String]("myself", "ourself", "ourselves", "yourself", "yourselves", "himself", "herself", "itself", "themself", "themselves") - - /** - * incompatibileCache is a HashSet that records which mentions are incompatible. - * If mentionA is incompatible with mentionB, the concatenation of mentionA.uniqueId and mentionB.uniqueId - * will be present in the cache. The cache does have a size limit and once filled, no new entries will be - * placed in the cache. This limit is controlled by the variable incompatibleCacheMaxSize, which defines - * the number of records stored in the cache. - */ - private val incompatibleCache: HashSet[String] = new HashSet[String]() - - /** - * The maximum number of items which will be stored in the incompatibility cache. - * Set to -1 for no limit. - */ - val incompatibleCacheMaxSize: Int = 20000 - - /** - * Analogous to the incompatibleCache, but for mentions which are compatible with one another. - */ - private val compatibleCache: HashSet[String] = new HashSet[String]() - - /** - * The maximum number of items which will be stored in the compatible cache. - * Set to -1 for no limit. - */ - val compatibleCacheMaxSize: Int = 20000 - - - /** - * Similar to incompatibileCache, invalidPronounDistanceCache, stores which pronominal mentions - * are too far from other mentions for resolution. It stores this as a HashSet of strings of the - * form: mentionA.uniqueId concatenated with mentionB.uniqueId, where mentionA is a pronominal reference - * and mentionB is a reference that is too many sentences away in the document. The cache does have a size limit - * and once filled, no new entries will be placed in the cache. This limit is controlled by the variable - * invalidPronounDistanceCacheMaxSize, which defines the number of records stored in the cache. 
- */ - private val invalidPronounDistanceCache: HashSet[String] = new HashSet[String]() - - /** - * The maximum number of items which will be stored in invalidPronounDistanceCache. - * Set to -1 for no limit. - */ - val invalidPronounDistanceCacheMaxSize: Int = 20000 - - - /** - * Analogous to the invalid pronoun distance cache, but for mentions which satisfy the distance requirement - */ - private val validPronounDistanceCache: HashSet[String] = new HashSet[String]() - - /** - * The maximum number of items which will be stored in validPronounDistanceCache. - * Set to -1 for no limit. - */ - val validPronounDistanceCacheMaxSize: Int = 20000 - - - /** - * The current mention is considered incompatible with the candidate antecedent if ANY of the following conditions hold: - * - One mention is a substring of the other mention - * - The mentions are in an "i-within-i" relationship - * - if the current mention is "this" and the candidate antecedent is farther than three sentences away - * - if the candidate antecedent is a second person pronoun - * - if the current mention is a bare plural (a plural common noun without a determiner or modifier) - * - if the mentions are in a subject object relationship - * @param currentMention - * @param candidateAntecedent - * @return - */ - def incompatible(currentMention: Mention, candidateAntecedent: Mention): Boolean = { - - // The string which is used to do a lookup in the cache - val cacheString: String = currentMention.uniqueId ++ candidateAntecedent.uniqueId - - // check cached values - if (incompatibleCache.contains(cacheString)) { - return true - } else if (compatibleCache.contains(cacheString)) { - return false - } else { - - // Check each of the compatibility criteria - val cmHeadString = currentMention.phrase.headToken.string.trim.toLowerCase - val caHeadString = candidateAntecedent.phrase.headToken.string.trim.toLowerCase - - - if (liesWithin(currentMention, candidateAntecedent) || - liesWithin(candidateAntecedent, currentMention) || // either mention lies within the other. - (isIWithinI(currentMention, candidateAntecedent) && !currentMention.phrase.isAppositionOf(candidateAntecedent.phrase) && !CorefFeatures.isRelativeFor(currentMention, candidateAntecedent)) || - (isIWithinI(candidateAntecedent, currentMention) && !candidateAntecedent.phrase.isAppositionOf(currentMention.phrase) && !CorefFeatures.isRelativeFor(candidateAntecedent, currentMention))|| // either mention is in an i-within-i relationship with other and neither is in an appositive or relative pronoun construction - isBarePlural(currentMention) || // mention is bare plural - ( (!reflexivePronouns.contains(cmHeadString) && !reflexivePronouns.contains(caHeadString)) && hasSubjectObjectRelationship(currentMention, candidateAntecedent))) { // mentions are in a subject-object relationship and neither is a reflexive pronoun - - if (incompatibleCache.size < incompatibleCacheMaxSize || incompatibleCacheMaxSize == -1) { - incompatibleCache.add(cacheString) - } - return true - } else { - if (compatibleCache.size < compatibleCacheMaxSize || compatibleCacheMaxSize == -1) { - compatibleCache.add(cacheString) - } - return false - } - } - } - - /** - * Returns true if the current mention is a first or second person pronoun and the number of sentences separating the two mentions is more than 3 - * or if the currentMention is "this" and the sentence distance is more than three. 
- * @param currentMention - * @param candidateAntecedent - * @return - */ - def incompatiblePronounMatch(currentMention: Mention, candidateAntecedent: Mention): Boolean = { - val cacheString: String = currentMention.uniqueId ++ candidateAntecedent.uniqueId - if (invalidPronounDistanceCache.contains(cacheString)) { - return true - } else if (validPronounDistanceCache.contains(cacheString)) { - return false - } else { - val cmHeadString = currentMention.phrase.headToken.string.trim.toLowerCase - if ((firstPersonPronouns.contains(cmHeadString) || secondPersonPronouns.contains(cmHeadString) || cmHeadString.equals("this")) && sentenceDistance(currentMention, candidateAntecedent) > 3) { - if (invalidPronounDistanceCache.size < invalidPronounDistanceCacheMaxSize || invalidPronounDistanceCacheMaxSize == - 1) { - invalidPronounDistanceCache.add(cacheString) - } - return true - } else { - if (validPronounDistanceCache.size < validPronounDistanceCacheMaxSize || validPronounDistanceCacheMaxSize == - 1) { - validPronounDistanceCache.add(cacheString) - } - return false - } - } - } - - - /** - * Returns true if a mention in cluster1 is incompatible with some mention in cluster2 - * @param cluster1 - * @param cluster2 - * @return - */ - def incompatible(cluster1: MentionCluster, cluster2: MentionCluster): Boolean = { - for (m1 <- cluster1.mentions) { - for (m2 <- cluster2.mentions) { - if (incompatible(m1,m2)) { - - // Make all of the mentions in two clusters incompatible - for (m3 <- cluster1.mentions) { - for (m4 <- cluster2.mentions) { - val cacheString: String = m3.uniqueId ++ m4.uniqueId - if (compatibleCache.contains(cacheString)) { - compatibleCache.remove(cacheString) - } - if (incompatibleCache.size < incompatibleCacheMaxSize || incompatibleCacheMaxSize == -1) { - incompatibleCache.add(cacheString) - } - } - } - - return true - } - } - } - return false - } - - /** - * Given a document, remove any mention from the internal within-doc coref structure - * if that mention lies within a larger mention. - * - * For example, if mention A has a span of tokens 30-36, mention B has a span of tokens 31-33, and mention C has - * a span of tokens 30-34. This function will remove mentions B and C. - * @param document - * @return - */ - def removeLiesWithinMentions(document: Document): Unit = { - for (m1 <- document.getCoref.mentions) { - for (m2 <- document.getCoref.mentions) { - if (m1.uniqueId != m2.uniqueId && liesWithin(m1,m2)) { - if ( (m1.phrase.end - m1.phrase.start) > (m2.phrase.end - m2.phrase.start)) { - document.getCoref.deleteMention(m2) - } else { - document.getCoref.deleteMention(m1) - } - } - } - } - } - - /** - * Given a document, remove any mention from the internal within-doc coref structure - * if that mention lies within a larger mention and both mentions have the same head word. - * @param document - * @return - */ - def removeLiesWithinMentionsWithSameHead(document: Document): Unit = { - for (m1 <- document.getCoref.mentions) { - for (m2 <- document.getCoref.mentions) { - if (m1.uniqueId != m2.uniqueId && liesWithinWithSameHeadWord(m1,m2)) { - if ( (m1.phrase.end - m1.phrase.start) > (m2.phrase.end - m2.phrase.start)) { - document.getCoref.deleteMention(m2) - } else { - document.getCoref.deleteMention(m1) - } - } - } - } - } - - /** - * Returns true if the span of m1 is contained by m2. 
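// Editor's sketch: the containment test that the removal passes above rely on, written over plain
// token offsets (sameSentence, start and end stand in for the corresponding phrase fields used below):
def spanLiesWithin(innerStart: Int, innerEnd: Int,
                   outerStart: Int, outerEnd: Int, sameSentence: Boolean): Boolean =
  sameSentence && innerStart >= outerStart && innerEnd <= outerEnd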
- * @param m1 - * @param m2 - * @return - */ - def liesWithin(m1: Mention, m2: Mention): Boolean = { - inTheSameSentence(m1, m2) && m1.phrase.start >= m2.phrase.start && m1.phrase.end <= m2.phrase.end - } - - - /** - * Returns true if the span of m1 is contained by m2 and the mentions have the same head words - * @param m1 - * @param m2 - * @return - */ - def liesWithinWithSameHeadWord(m1: Mention, m2: Mention): Boolean = { - inTheSameSentence(m1, m2) && m1.phrase.start >= m2.phrase.start && m1.phrase.end <= m2.phrase.end && m1.phrase.headToken == m2.phrase.headToken - } - - - /** - * Returns true if m1 is a descendant of m2 in the parse tree - */ - def isIWithinI(m1: Mention, m2: Mention): Boolean = { - var curr: Token = m1.phrase.headToken - val m2head: Token = m2.phrase.headToken - var i_within_i: Boolean = false - if (inTheSameSentence(m1,m2)) { - while (curr != null && !i_within_i) { - i_within_i = curr.positionInSentence == m2head.positionInSentence - curr = curr.parseParent - } - } - i_within_i - } - - - - /** - * Returns the number of sentences (absolute value) between the two mentions - * @param m1 - * @param m2 - * @return - */ - def sentenceDistance(m1: Mention, m2: Mention): Int = { - if (m1.attr.contains(classOf[DeterministicCorefCache]) && m1.attr.contains(classOf[DeterministicCorefCache])) - return math.abs(m1.attr[DeterministicCorefCache].absoluteSentenceNumber - m2.attr[DeterministicCorefCache].absoluteSentenceNumber) - else - return math.abs(absoluteSentenceNumber(m1) - absoluteSentenceNumber(m2)) - } - - - /** - * Returns the sentence number of the mention within the document. Note that this is the number of sentences which - * appear before the sentence in which the mention occurs in the document. This differs from the field of the sentence - * object, indexInSection, which returns the sentence's position in its section. - * @param mention - * @return - */ - def absoluteSentenceNumber(mention: Mention):Int = { - val mSection: Section = mention.phrase.section - val mSectionIdx: Int = mSection.indexInDocument - var result: Int = -1 - if (mSectionIdx == 0) { - result = mention.phrase.sentence.indexInSection - } else { - var numSentenceInSectionsBeforeM: Int = mention.phrase.sentence.indexInSection - val allSections: Seq[Section] = mSection.document.sections - for (i <- 0 to mSectionIdx) { - val numSentences: Int = allSections(i).sentences.length - numSentenceInSectionsBeforeM += numSentences - } - result = mention.phrase.sentence.indexInSection + numSentenceInSectionsBeforeM - - } - return result - } - - - /** - * Returns true iff m1 and m2 appear in the same sentence - * @param m1 - * @param m2 - * @return - */ - def inTheSameSentence(m1: Mention, m2: Mention): Boolean = { - return 0 == sentenceDistance(m1,m2) - } - - - /** - * Returns true if the mention's head token has a parse label of subject (i.e. either csubj, nsubj, csubjpass, or nsubjpass) - * @param m - * @return - */ - def isSubject(m: Mention): Boolean = { - val mentionParseLabel: Int = m.phrase.headToken.parseLabel.intValue - return (mentionParseLabel == ParseTreeLabelDomain.csubj || mentionParseLabel == ParseTreeLabelDomain.csubjpass - || mentionParseLabel == ParseTreeLabelDomain.nsubj || mentionParseLabel == ParseTreeLabelDomain.nsubjpass) - } - - /** - * Returns true if mention's head token has a parse label of object (i.e. 
either dobj or iobj) - * @param m - * @return - */ - def isObject(m: Mention): Boolean = { - val mentionParseLabel: Int = m.phrase.headToken.parseLabel.intValue - return (mentionParseLabel == ParseTreeLabelDomain.dobj || mentionParseLabel == ParseTreeLabelDomain.iobj) // One could consider including pobj here, I found results were better without it - } - - - - /** - * Returns true if one of the mentions is the subject of a particular verb and the other mention an object of the same verb - * @param m1 - * @param m2 - * @return - */ - def hasSubjectObjectRelationship(m1: Mention, m2: Mention): Boolean = { - if (inTheSameSentence(m1, m2)) { - val m1HeadToken: Token = m1.phrase.headToken - val m2HeadToken: Token = m2.phrase.headToken - if ((isSubject(m1) && isObject(m2)) || (isSubject(m2) && isObject(m1))) { - // find the verb of each and make sure that they are the same: - var tmp: Token = m1HeadToken - var m1HeadVerb: Token = null - var found: Boolean = false - while (tmp != null && !found) { - if (tmp.attr[PennPosTag].isVerb) { - m1HeadVerb = tmp - found = true - } else { - tmp = tmp.parseParent - } - } - tmp = m2HeadToken - var m2HeadVerb: Token = null - found = false - while (tmp != null && !found) { - if (tmp.attr[PennPosTag].isVerb) { - m2HeadVerb = tmp - found = true - } else { - tmp = tmp.parseParent - } - } - if (m1HeadVerb == m2HeadVerb) { - return true - } - } - } - return false - } - - - /** - * Returns DCorefPersonDomain.FIRST if the mention is a first person pronoun, DCorefPersonDomain.SECOND if the mention is a second person pronoun - * else returns DCorefPersonDomain.THIRD. Note that "I"s and "You"s appearing in quotation marks are considered third person - * @param mention - * @return - */ - def getPerson(mention: Mention): Int = { - // if pronoun is "I" & "you" or relative return false - val currentMentionHead: Token = mention.phrase.headToken - val currentMentionHeadString: String = currentMentionHead.lemmaString.trim.toLowerCase - if (CorefFeatures.relativizers.contains(currentMentionHeadString)) { - return DCorefPersonDomain.THIRD - } - var firstQuote: Boolean = true - var firstQuoteIdx: Int = -1 - if (firstPersonPronouns.contains(currentMentionHeadString) || secondPersonPronouns.contains(currentMentionHeadString)) { - val currentMentionSentenceTokens: Seq[Token] = mention.phrase.sentence.tokens - for (token <- currentMentionSentenceTokens) { - if (token.string.trim == "\"" || token.string.trim == "``"|| token.string.trim == "''") { - if (firstQuote) { - firstQuoteIdx = token.positionInSection - firstQuote = false - } else { - if (token.positionInSection > currentMentionHead.positionInSection && firstQuoteIdx < currentMentionHead.positionInSection) { - return DCorefPersonDomain.THIRD - } - firstQuote = true - } - } - } - if (firstPersonPronouns.contains(currentMentionHeadString)) - return DCorefPersonDomain.FIRST - else - return DCorefPersonDomain.SECOND - } else { - return DCorefPersonDomain.THIRD - } - } - - /** - * Returns true if the two mentions to not have differing modifiers that fall into the set of "location modifiers", which are - * words such as "north" or "southeast" - * @param m1 - * @param m2 - * @return - */ - def agreesInLocation(currentMention: Mention, candidateAntecedent: Mention): Boolean = { - val currentMentionModifiers: Set[String] = currentMention.phrase.tokens.toSeq.filter(x => (x.attr[PennPosTag].isNoun || x.attr[PennPosTag].isAdjective) && x != (currentMention.phrase.headToken)).map(_.string.trim.toLowerCase).toSet - val 
candidateAntecedentModifiers: Set[String] = candidateAntecedent.phrase.tokens.toSeq.filter(x => (x.attr[PennPosTag].isNoun || x.attr[PennPosTag].isAdjective) && x != (candidateAntecedent.phrase.headToken)).map(_.string.trim.toLowerCase).toSet - if (currentMentionModifiers.intersect(locationModifiers) != candidateAntecedentModifiers.intersect(locationModifiers)) { - return false - } - return true - - } - - /** - * Returns true if every modifier of the current mention is present in the antecedent. Additionally, any modifier of either mention - * which is in the set of location modifiers (words such as "north or southwest") must be present in both mentions. Note how - * this definition is not symmetric; the antecedent's modifiers (except for the location modifiers) need not be in mention. - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - def hasCompatibleModifiersOnly(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - // Check that current mention does not have additional modifier that antecedent doesn't have - // Check that the location modifiers are consistent - val currentMentionModifiers: Set[String] = currentMention.phrase.tokens.toSeq.filter(x => (x.attr[PennPosTag].isNoun || x.attr[PennPosTag].isAdjective || x.attr[PennPosTag].categoryValue == "CD") && x != (currentMention.phrase.headToken)).map(_.string.trim.toLowerCase).toSet - val candidateAntecedentModifiers: Set[String] = candidateAntecedent.phrase.tokens.toSeq.filter(x => (x.attr[PennPosTag].isNoun || x.attr[PennPosTag].isAdjective || x.attr[PennPosTag].categoryValue == "CD") && x != (candidateAntecedent.phrase.headToken)).map(_.string.trim.toLowerCase).toSet - if (currentMentionModifiers.diff(candidateAntecedentModifiers).size != 0) { - return false - } - if (currentMentionModifiers.intersect(locationModifiers) != candidateAntecedentModifiers.intersect(locationModifiers)) { - return false - } - return true - - } - - /** - * Returns true if every mention in mentionCluster1 has compatible modifiers with every mention in mentionCluster2 - * @param mentionCluster1 - * @param mentionCluster2 - * @param cm - * @return - */ - def hasCompatibleModifiersOnly(mentionCluster1: MentionCluster, mentionCluster2: MentionCluster, cm: MentionClusterManager): Boolean = { - for (m1 <- mentionCluster1.mentions) { - for (m2 <- mentionCluster2.mentions) { - if (!hasCompatibleModifiersOnly(m1, m2, cm)) { - return false - } - } - } - return true - } - - - /** - * Returns true if the mention is a plural common noun without any modifiers or determiners - * @param currentMention - * @return - */ - def isBarePlural(currentMention: Mention): Boolean = { - val currentMentionHead: Token = currentMention.phrase.headToken - val currentMentionHeadPos: PennPosTag = currentMentionHead.attr[PennPosTag] - return (currentMentionHeadPos.categoryValue == "NNS" && currentMention.phrase.tokens.length == 1) - } - - - /** - * Returns true if all the modifiers in a given phrase which are labeled "CD" in the parse - * agree with those in other mention - * @param currentMention - * @param candidateAntecedent - * @return - */ - def agreementBetweenModifiersWhichAreNumbers(currentMention: Mention, candidateAntecedent: Mention): Boolean = { - val currentMentionModifiers: Set[String] = currentMention.phrase.tokens.toSeq.filter(x => (x.attr[PennPosTag].categoryValue == "CD") && x != (currentMention.phrase.headToken)).map(_.string.trim.toLowerCase).toSet - val candidateAntecedentModifiers: 
Set[String] = candidateAntecedent.phrase.tokens.toSeq.filter(x => (x.attr[PennPosTag].categoryValue == "CD") && x != (candidateAntecedent.phrase.headToken)).map(_.string.trim.toLowerCase).toSet - if (currentMentionModifiers.diff(candidateAntecedentModifiers).size != 0) { - return false - } - return true - } - - - /** - * Ranks m1 and m2. The ranking is based on the following criteria: - * 1. Proper nouns are more representative than common nouns, and common nouns more representative than pronouns - * 2. If they are both proper, common or pronoun, the following attributes of the mentions are used for ranking - * - Distance of head to start of phrase (larger distance better) - * - Section of mention (lower index is better) - * - Sentence position in section (lower index is better) - * - Head position in Sentence (earlier is better) - * - Length of mention (if len < 5, shorter is better, else longer is better) - * @param m1 - * @param m2 - * @return The more representative of the two mentions - */ - def moreRepresentativeOf(m1: Mention, m2: Mention): Mention = { - - // First check the part of speech - if ((m1.attr[MentionCharacteristics].isProper) && (m2.attr[MentionCharacteristics].isPRO || m2.attr[MentionCharacteristics].isNoun)) - return m1 - else if ((m1.attr[MentionCharacteristics].isNoun) && (m2.attr[MentionCharacteristics].isPRO)) - return m1 - if ((m2.attr[MentionCharacteristics].isProper) && (m1.attr[MentionCharacteristics].isPRO || m1.attr[MentionCharacteristics].isNoun)) - return m2 - else if ((m2.attr[MentionCharacteristics].isNoun) && (m1.attr[MentionCharacteristics].isPRO)) - return m2 - - // Determine value for each tie-breaker - val m1RankingAttributes: Seq[Int] = Seq[Int](-m1.phrase.headToken.positionInSentence + m1.phrase.start, m1.phrase.section.indexInDocument, m1.phrase.sentence.indexInSection, m1.phrase.headToken.positionInSentence, math.max(0, m1.phrase.tokens.length - 5), m1.phrase.tokens.length) - val m2RankingAttributes: Seq[Int] = Seq[Int](-m2.phrase.headToken.positionInSentence + m2.phrase.start, m2.phrase.section.indexInDocument, m2.phrase.sentence.indexInSection, m2.phrase.headToken.positionInSentence, math.max(0, m2.phrase.tokens.length - 5), m2.phrase.tokens.length) - - for (i <- 0 to m1RankingAttributes.length - 1 by 1) { - if (m1RankingAttributes(i) < m2RankingAttributes(i)) { - return m1 - } else if (m1RankingAttributes(i) > m2RankingAttributes(i)) { - return m2 - } - } - - // As a final tie breaker, we just sort the mentions. 
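// Editor's sketch: the loop above is a lexicographic comparison of the two attribute vectors. On
// plain sequences, "the first argument wins" can be written as below (the enclosing method then
// breaks full ties with the string comparison that follows):
def firstWinsLexicographically(a: Seq[Int], b: Seq[Int]): Boolean =
  a.zip(b).find { case (x, y) => x != y }.forall { case (x, y) => x < y }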
In testing, this case was never hit - if (m1.phrase.string <= m2.phrase.string) - return m1 - else - return m2 - } - - /** - * Returns the depth of the mention in the parse tree - * @param mention - * @return - */ - def depthOfMention(mention: Mention): Int = { - var depth: Int = 0 - var tmp: Token = mention.phrase.headToken - while (tmp.parseParent ne null) { - tmp = tmp.parseParent - depth+=1 - } - return depth - } - - /** - * Returns the animacy of a mention - * @param mention - * @return - */ - def getAnimacy(mention: Mention): Int = { - - // Check if Animate pronoun - if (animatePronouns.contains(mention.phrase.headToken.string.trim.toLowerCase)) { - return DCorefAnimacyDomain.ANIMATE - } else if (mention.phrase.attr.contains(classOf[OntonotesPhraseEntityType])) { - if (mention.phrase.attr[OntonotesPhraseEntityType].intValue == OntonotesEntityTypeDomain.PERSON) { - return DCorefAnimacyDomain.ANIMATE - } else if (!(mention.phrase.attr[OntonotesPhraseEntityType].intValue == OntonotesEntityTypeDomain.MISC || mention.phrase.attr[OntonotesPhraseEntityType].intValue == OntonotesEntityTypeDomain.O)) { - return DCorefAnimacyDomain.INANIMATE - } - } - return DCorefAnimacyDomain.UNKNOWN - } - - /** - * Removes all entries from the various caches of the CorefUtil - */ - def clearCaches(): Unit = { - incompatibleCache.clear() - compatibleCache.clear() - invalidPronounDistanceCache.clear() - validPronounDistanceCache.clear() - } - - /** - * Returns the mention string with determiners removed - * @param m - * @return - */ - def mentionStringWithoutDeterminer(m: Mention): String = { - var res: String = "" - for (token <- m.phrase.tokens) { - if (token.attr[PennPosTag].categoryValue != "DT" ||token.attr[PennPosTag].categoryValue != "WDT" || token.attr[PennPosTag].categoryValue != "PDT") { - res = res + token.string + " " - } - } - return res.trim - } - - - /** - * Removes any phrases starting with a comma or a WDT that appear after the head token, returns the resulting - * string of the mention. - * @param mention - * @return - */ - def relaxString(mention: Mention): String = { - val mentionHead: Token = mention.phrase.headToken - var mentionString: String = "" - val mentionHeadIdx: Int = mentionHead.positionInSection - mention.phrase.start - var idx: Int = 0 - for (token <- mention.phrase.tokens) { - if ((idx > mentionHeadIdx) && (token.string.trim == "," || token.attr[PennPosTag].categoryValue == "WDT" || token.attr[PennPosTag].categoryValue == "WP" || token.attr[PennPosTag].categoryValue == "WP$" || token.attr[PennPosTag].categoryValue == "WRB")) { - return mentionString.trim - } else { - mentionString += token.string.trim + " " - } - idx += 1 - } - return mentionString.trim - } - -} - - - - -/** - * An implementation of a deterministic coreference system - */ -object DeterministicCoref extends DocumentAnnotator { - - // todo fix this - @deprecated("This exists to preserve prior behavior, it should be a constructor argument", "10/5/15") - val lexicon = new StaticLexicons()(LexiconsProvider.classpath()) - - private val CFUtil: CorefUtil = new CorefUtil() - - // The ordered list of sieves used. The sieves will be applied in the order the appear in the list - // Note: To turn debug information on pass a directory name as the second argument to the sieves you wish to debug, i.e. 
PreciseConstructSieve(CFUtil, "debug") - private val _sieves: List[Sieve] = List(new ExactMatchSieve(CFUtil), new RelaxedStringMatchSieve(CFUtil), new PreciseConstructionsSieve(CFUtil, "", lexicon), new StrictHeadMatchingSieve(CFUtil), new StrictHeadMatchingSieveVar1(CFUtil), new StrictHeadMatchingSieveVar2(CFUtil), new AliasSieve(CFUtil), new RelaxedHeadMatchingSieve(CFUtil), new LexicalChainSieve(CFUtil), new PronounSieve(CFUtil)) - - // A sorted version of the mentions. - private var _sorted_mentions: Seq[Mention] = null - - // PreReq Attributes: - def prereqAttrs: Seq[Class[_]] = ParseBasedPhraseFinder.prereqAttrs.toSeq ++ Seq(classOf[PennPosTag]) - - // Adds the WithinDocCoref attribute - def postAttrs = Seq(classOf[WithinDocCoref]) - - val options: CorefOptions = new CorefOptions - - - - /** - * Generates the list of potential mentions within the Document (doc) - * and adds each of these mentions to the WithinDocCoref object (coref) - * Note that it adds each of these mentions with the attr[MentionCharacteristics] - * which is used by the different sieves. - */ - private def annotateMentions(doc: Document): Unit = { - - // Mention detection method number 1 - if (false) { - if (doc.coref.mentions.isEmpty) (ConllPhraseFinder(doc) ++ PronounFinder(doc) ++ NnpPosNounPhraseFinder(doc) ++ AcronymNounPhraseFinder(doc)).distinct.foreach(phrase => doc.getCoref.addMention(phrase)) - doc.coref.mentions.foreach(mention => NounPhraseEntityTypeLabeler.process(mention.phrase)) - doc.coref.mentions.foreach(mention => NounPhraseGenderLabeler.process(mention.phrase)) - doc.coref.mentions.foreach(mention => NounPhraseNumberLabeler.process(mention.phrase)) - doc.coref.mentions.foreach(mention => mention.attr += new MentionCharacteristics(mention, lexicon)) - } else { - - // if the document has not been parsed. N.B. This only applies if you are using Gold mentions (and are using the testing framework) - if (!doc.sentences.isEmpty && doc.sentences.toSeq(0).parse == null) { - doc.sentences.foreach(OntonotesTransitionBasedParser.process) - } - - // Parse based mention detection - if (doc.getCoref.mentions == null || doc.getCoref.mentions.isEmpty) { - val phraseMentions: Seq[Phrase] = ParseBasedPhraseFinder.getPhrases(doc) - for (phrase <- phraseMentions.distinct) { - doc.getCoref.addMention(phrase) - } - CFUtil.removeLiesWithinMentionsWithSameHead(doc) - } - - // Add mention attributes - for (mention <- doc.getCoref.mentions) { - mention.attr += new MentionCharacteristics(mention, lexicon) - mention.attr += new DeterministicCorefCache(mention) - } - - // Label phrases - doc.getCoref.mentions.foreach(mention => NounPhraseEntityTypeLabeler.process(mention.phrase)) - doc.getCoref.mentions.foreach(mention => NounPhraseGenderLabeler.process(mention.phrase)) - doc.getCoref.mentions.foreach(mention => NounPhraseNumberLabeler.process(mention.phrase)) - } - - } - - - - - - /** - * Determine the ordering of the candidate antecedents for each mention. The order of the antecedents - * is determined by the position of the mentions in the parse tree. For all mentions, antecedents are considered - * in reverse sentence order from the mention. For pronominal mentions, the order of antecedents is always from - * shallowest to deepest in the parse tree, earliest appearing in the sentence to latest appearing in the sentence. 
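// Editor's sketch (hypothetical sieve, not in the original code): a further pass could be appended
// to the _sieves list above; it only needs a name, the resolveNominal/resolvePronominal flags and a
// matchingFunction, since the shared resolveMentions driver in the Sieve base class defined further
// below supplies candidate ordering, pruning, compatibility checks and cluster merging.
//   class HeadLemmaSieve(CFUtil: CorefUtil) extends Sieve(CFUtil) {
//     override val name: String = "HeadLemmaSieve"
//     override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention,
//                                   cm: MentionClusterManager): Boolean =
//       currentMention.phrase.headToken.lemmaString.equalsIgnoreCase(
//         candidateAntecedent.phrase.headToken.lemmaString)
//   }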
- * For nominal mentions, antecedents in the same sentence are ordered shallowest to deepest, earliest to latest and - * antecedents in previous sentences are ordered shallowest to deepest, latest to earliest. - * - * The ordering is sorted as a map from the uniqueId of a mention to a list of integers which are the index - * into the set of mentions for the document for the antecedents. - * @param document - * @return - */ - def determineOrdering(document: Document): HashMap[String, List[Int]] = { - // For each mention we want to find it's score in each sentence - val DEPTH_CONSTANT: Int = 10000 - val START_CONSTANT: Int = 100 - val END_CONSTANT: Int = 1 - // Find the Left - to - Right first ordering of the mentions in each sentence - - val sentenceNumber2LROrdering: HashMap[Int,Seq[Tuple2[Int,Int]]] = HashMap[Int,Seq[Tuple2[Int,Int]]]() - var idx:Int = 0 - for (mention <- _sorted_mentions) { - val mSentNo: Int = mention.attr[DeterministicCorefCache].absoluteSentenceNumber - val mentionScore: Int = CFUtil.depthOfMention(mention)*DEPTH_CONSTANT + mention.phrase.start*START_CONSTANT + mention.phrase.end*END_CONSTANT - - if (!sentenceNumber2LROrdering.contains(mSentNo)) { - sentenceNumber2LROrdering += (mSentNo -> Seq(Tuple2[Int,Int](idx,mentionScore))) - } else { - sentenceNumber2LROrdering(mSentNo) = sentenceNumber2LROrdering(mSentNo) :+ Tuple2[Int,Int](idx,mentionScore) - } - idx += 1 - } - - for (key <- sentenceNumber2LROrdering.keys) { - sentenceNumber2LROrdering.update(key, sentenceNumber2LROrdering(key).sortBy(m => (m._2,m._1))) - } - - var ordering: HashMap[String, List[Int]] = HashMap[String, List[Int]]() - for (mention <- _sorted_mentions) { - val mentionSentenceNo: Int = mention.attr[DeterministicCorefCache].absoluteSentenceNumber - var candidateOrdering: Seq[Int] = sentenceNumber2LROrdering(mentionSentenceNo).map(_._1) - if (mention.attr[MentionCharacteristics].isPRO) { - for (i <- mentionSentenceNo - 1 to 0 by -1) - if (sentenceNumber2LROrdering.contains(i)) - candidateOrdering = candidateOrdering ++ sentenceNumber2LROrdering(i).map(_._1) - } else { - for (i <- mentionSentenceNo - 1 to 0 by -1) - if (sentenceNumber2LROrdering.contains(i)) - candidateOrdering = candidateOrdering ++ sentenceNumber2LROrdering(i).map(_._1).reverse - } - ordering += (mention.uniqueId -> candidateOrdering.toList) - } - return ordering - } - - - def tokenAnnotationString(token: Token): String = { - val entities = token.document.getCoref.entities.toSeq - _sorted_mentions.find(m => m.phrase.contains(token)) match { - case Some(mention) => - val mtokens = mention.phrase.tokens - if (mtokens.length == 1) "(" + entities.indexOf(mention.entity) + ")" - else if (mtokens.indexOf(token) == 0) "(" + entities.indexOf(mention.entity) - else if (mtokens.indexOf(token) == mtokens.length - 1) entities.indexOf(mention.entity) + ")" - else "_" - case None => "_" - } - } - - /** - * Find and resolve mentions in a document - */ - def process(document: Document) = { - - - CFUtil.clearCaches() - - //val startTime = java.lang.System.currentTimeMillis() - - // Find all mentions in the document - annotateMentions(document) - - // Reset the entity attributes of the mentions, in the case we are using gold mentions - document.getCoref.resetPredictedMapping() - - // A list of mentions sorted by their position in the document. 
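// Editor's sketch: the per-mention score used by determineOrdering above, together with the way a
// sieve later walks the resulting order (stop at the first accepted candidate). Shown on plain
// values: parse-tree depth dominates, then phrase start, then phrase end, so sorting ascending
// gives "shallowest first, earliest in the sentence first".
def orderingScore(parseDepth: Int, phraseStart: Int, phraseEnd: Int): Int =
  parseDepth * 10000 + phraseStart * 100 + phraseEnd
def firstAcceptedAntecedent(candidateOrder: List[Int], accepts: Int => Boolean): Option[Int] =
  candidateOrder.find(accepts)  // None: the mention stays unresolved for that sieve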
- _sorted_mentions = document.getCoref.mentions.sortBy(m => (m.phrase.start, m.phrase.end)) - - // Determine the ordering of antecedents for the mentions - val ordering: HashMap[String, List[Int]] = determineOrdering(document) - - // a new mention cluster manager object. - val cm = new MentionClusterManager(_sorted_mentions, CFUtil) - - // Use each of the sieves to resolve the mentions - for (sieve <- _sieves) { - sieve.resolveMentions(_sorted_mentions, ordering, cm, document) - } - - /* Convert the clusters into Entity groups */ - for (mention <- _sorted_mentions) { - val cluster: MentionCluster = cm.getCluster(mention) - var bestCand: Mention = cluster.firstMention - if (bestCand.entity ne null) { - bestCand.entity += mention - } else { - val entity = document.getCoref.newEntity() - entity += bestCand - entity += mention - } - } - //println("Document " + document.uniqueId + " Processed in " + ( java.lang.System.currentTimeMillis() - startTime) + " ms") - document - } - - /** - * A subclass used for evaluation purposes. Follows the similar class in ForwardCoref. - */ - class CorefTester(scorer: CorefConllOutput, scorerMutex: Object, val pool: ExecutorService) { - def map(doc: Document): Unit = { - - // Make sure that the targetCoref is there - assert(doc.targetCoref ne null, "Cannot perform test on document without test key.") - val trueCoref = doc.targetCoref - - // Reset the predicted mapping - if (doc.coref ne null) { - doc.coref.resetPredictedMapping() - } - - // process the document - process(doc) - - // remove singletons - val predCoref = doc.getCoref - predCoref.removeSingletons() - - // score - - val b3 = ClusterF1Evaluation.BCubedNoSingletons(predCoref, trueCoref) - val ce = ClusterF1Evaluation.CeafE(predCoref, trueCoref) - val muc = ClusterF1Evaluation.MUCNoSingletons(predCoref, trueCoref) - val cm = ClusterF1Evaluation.CeafM(predCoref, trueCoref) - - scorerMutex.synchronized { - scorer.microB3.microAppend(b3) - scorer.microCE.microAppend(ce) - scorer.microCM.microAppend(cm) - scorer.microMUC.microAppend(muc) - } - } - - def runParallel(ins: Seq[Document]) = cc.factorie.util.Threading.parMap(ins, pool)(map) - - def runSequential(ins: Seq[(Document)]) = ins.map(map) - } - - /** - * Perform the classification test on the set of documents passed in to the function - * @param testDocs - * @param wn - * @param name - * @return - */ - def doTest(testDocs: Seq[Document], wn: WordNet, name: String): Double = { - val scorer = new CorefConllOutput - object ScorerMutex - val pool = java.util.concurrent.Executors.newFixedThreadPool(options.numThreads) - var accuracy = 0.0 - try { - val tester = new CorefTester(scorer, ScorerMutex, pool) - tester.runSequential(testDocs) - println("-----------------------") - println(" * Overall scores") - scorer.printInhouseScore(name) - accuracy = scorer.microMUC.f1 - } finally pool.shutdown() - accuracy - } -} - - -/** - * A trait extended by each of the sieves used for debugging, defines useful methods for easily writing debug output. 
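// Editor's usage sketch (not part of the original patch; assumes the usual FACTORIE
// DocumentAnnotator workflow): once the annotators listed in prereqAttrs have been run over a
// Document, the deterministic system is applied in a single pass and its predictions can be read
// back from the document's WithinDocCoref, e.g.
//   val annotated = DeterministicCoref.process(doc)   // doc: cc.factorie.app.nlp.Document
//   println(annotated.getCoref.entities.size + " predicted entities")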
- */ -trait DeterministicCorefDebug { - lazy val debugOutputDir: String = "" - lazy val debugOutputFilename: String = "" - private lazy val debugOutputDirectoryFile: java.io.File = new java.io.File(debugOutputDir) - lazy val debugOutputFile: java.io.PrintWriter = {debugOutputDirectoryFile.mkdirs(); new java.io.PrintWriter(new java.io.BufferedWriter(new java.io.FileWriter(new java.io.File(debugOutputDir, debugOutputFilename), true)));} - /** - * The String which defines the HTML table format for the sieve output - */ - val debugHTMLTableStart: String = "" - /** - * Ending string of the table - */ - val debugHTMLTableEnd: String = "
" - lazy val performDebug: Boolean = {if (debugOutputDir == "" || debugOutputDir == null) false else true} - - /** - * Writes the inputted string to the debugout output file and flushes the file - * @param s - */ - def debugPrint(s: String): Unit = { - if (debugOutputFile != null && performDebug) { - debugOutputFile.print(s) - debugOutputFile.flush() - } - } - - /** - * Writes the inputted string to the debugout output file, writes a new line and flushes the file - * @param s - */ - def debugPrintln(s: String = ""): Unit = { - if (debugOutputFile != null && performDebug) { - debugOutputFile.print(s + "\n") - debugOutputFile.flush() - } - } - - /** - * Method used for formatting debug output files, creates a HTML table entry of X if the inputted boolean is true - * otherwise creates an empty table entry. - * @param b - * @return - */ - def ifTrueX(b: Boolean): String = { - if (b) return " X " else " " - } -} - - -/** - * The abstract definition of the Sieve class. The main difference between the sieves is what is used - * as the matching criteria which is defined by the matching function - */ -abstract class Sieve(CFUtil: CorefUtil, debugDirectory: String = "") extends DeterministicCorefDebug { - - /** - * A flag which indicates whether or not nominal mentions will be resolved by the sieve. The default value - * is true. - */ - val resolveNominal: Boolean = true - - /** - * A flag which indicates whether or not pronominal mentions will be resolved by the sieve. The default value is false. - */ - val resolvePronominal: Boolean = false - - /** - * The name of the sieve. - */ - val name: String = "sieve" - - /** - * A flag which indicates whether or not to force coreferent mentions to satisfy the compatibility requirements defined - * in the CorefUtil class. - */ - val restrictToCompatibleMentions: Boolean = true - - /** - * A flag which requires pronominal mentions to satisify the pronoun distance requirement specified in the CorefUtil - * class - */ - val holdPronominalMentionsToDistanceRequirement: Boolean = true - - - // Override the debug output directories - override lazy val debugOutputDir: String = debugDirectory - override lazy val debugOutputFilename: String = name + ".html" - - - /** - * The process for resolving mentions in a document. 
Iterate over each mention in the set of mentions comparing it to - * candidate antecedents in the order specified by the ordering parameter, resolving mentions as defined by the particular sieve's matching function - * @param mentions - the set of mentions - * @param ordering - a map from the unique id of a mention to a list of candidate antecedents represented as integers, which are indices to the set of mentions - * @param cm - the cluster manager - * @param document - the document - */ - def resolveMentions(mentions: Seq[Mention], ordering: HashMap[String, List[Int]], cm: MentionClusterManager, document: Document): Unit = { - var i: Int = 0 - var idx: Int = 0 - var not_resolved: Boolean = true - - // val startTime = java.lang.System.currentTimeMillis() - // Loop over mentions - for (mention <- mentions) { - // If the mention satisfies the pruning criteria, attempt to resolve it - if (satisfiesPruningCriteria(mention)) { - // Make sure that the mention is the first in the cluster - if (cm.isFirstInCluster(mention)) { - // Check to make sure it satisfies the nominal/pronominal restrictions - if ((!mention.attr[MentionCharacteristics].isPRO && resolveNominal) || (mention.attr[MentionCharacteristics].isPRO && resolvePronominal)) { - - // Debug output - if (performDebug) { - debugPrintln("

Attempting to Resolve Mention: " + mention.phrase.string + " from sentence #" + mention.phrase.sentence.indexInSection + "" + - " words " + (mention.phrase.start - mention.phrase.sentence.start) + "-" + (mention.phrase.end - mention.phrase.sentence.start) + - ". The cluster of this word is: " + cm.getCluster(mention).toStringHTML + - " The most representative element is " + cm.getCluster(mention).mostRepresentativeMention.phrase.string + "" + - " The head token of the most representative element in the cluster is: " + cm.getCluster(mention).mostRepresentativeMention.phrase.headToken.string.trim + "" + - " The cluster attributes are: " + cm.getCluster(mention).attributeString + "
") - debugPrintln(debugHTMLTableStart) - } - - // i is the index into the list ordering(mention.uniqueId) - i = 0 - not_resolved = true - // Iterate through possible antecedents, stop if the mention is resolved - while (i < ordering(mention.uniqueId).length && not_resolved) { - idx = ordering(mention.uniqueId)(i) - val candidate_match: Mention = mentions(idx) - - // Make sure that the mentions are compatible - if ((!restrictToCompatibleMentions || !CFUtil.incompatible(cm.getCluster(mention), cm.getCluster(candidate_match))) && - (!holdPronominalMentionsToDistanceRequirement || (!mention.attr[MentionCharacteristics].isPRO || !CFUtil.incompatiblePronounMatch(mention, candidate_match)))) { - - /* - * Make sure that the candidate antecedent (if in the same sentence, appears before the current mention) - * This is necessary because of the way in which the ordering is done -- rather than calculating - * the unique ordering for each mention, we do it on a sentence by sentence level. - */ - if (mention.phrase.start >= candidate_match.phrase.start && mention != candidate_match) { - if (matchingFunction(mention, candidate_match, cm)) { - // Update clustering if necessary - cm.mergeClusters(mention, candidate_match) - not_resolved = false - } - } - } - i += 1 - } - if (performDebug) { - debugPrintln(debugHTMLTableEnd) - } - } - } - } - } - if (performDebug) { - debugPrintln(cm.toHTMLString) - } - //println("\t" + name + " in " + ( java.lang.System.currentTimeMillis() - startTime) + " ms") - } - - /** - * Returns true only if the mention satisfies the pruning criteria. The default pruning criteria is that the - * first token in the mention string is neither an indefinite article nor indefinite pronoun. - */ - def satisfiesPruningCriteria(mention: Mention): Boolean = { - return !mention.attr[DeterministicCorefCache].isIndefiniteArticleOrPronoun - } - - /** - * Each sieve defines this as its own function for resolving mentions. - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean - -} - - -/** - * A sieve used to resolve mentions which are an exact, but case insensitive, string match - */ -class ExactMatchSieve(CFUtil: CorefUtil, debugDirectory: String = "") extends Sieve(CFUtil, debugDirectory) { - override val name: String = "ExactMatchSieve" - - // Debug settings - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "" - - /** - * The pruning criteria does not apply to this sieve. 
- * @param mention - * @return - */ - override def satisfiesPruningCriteria(mention: Mention): Boolean = { - return true - } - - /** - * The matching function of the exact match sieve resolves a mention to a candidate antecedent if the mentions are a case insensitive string match - * or a case insenstive string match match with the exception of an apostrophe s - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - - val res: Boolean = (currentMention.attr[MentionCharacteristics].lowerCaseString.equals(candidateAntecedent.attr[MentionCharacteristics].lowerCaseString) || - currentMention.attr[MentionCharacteristics].lowerCaseString.equals(candidateAntecedent.attr[MentionCharacteristics].lowerCaseString.replace(" 's", "")) || - candidateAntecedent.attr[MentionCharacteristics].lowerCaseString.equals(currentMention.attr[MentionCharacteristics].lowerCaseString.replace(" 's", ""))) - - if (performDebug) { - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("" + ifTrueX(res) + "") - } - - return res - } -} - - - -/** - * Precise Constructs Sieve, used for resolving mentions through specific grammatical constructions such as apposition as well as equivalent word forms such as demonyms and acronyms. - */ -class PreciseConstructionsSieve(CFUtil: CorefUtil, debugDirectory: String = "", lexicon:StaticLexicons) extends Sieve(CFUtil, debugDirectory) { - override val name: String = "PreciseConstructionsSieve" - - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "
Candidate Candidate Sent No Candidate Span Exact Match
" + candidateAntecedent.phrase.string + " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + "
" - override val resolvePronominal: Boolean = true - - /** - * The matching function of the precise constructs sieve resolves a mention to a candidate antecedent if any of the following relationships between the mentions exists: - * the mentions are in apposition of one another, the mentions are in a predicate nominative relationship, one mention is an acronym of the other mention, one mention is - * a demonym of the other relation, or one mention is a relative pronoun of the other mention. - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - - val res1: Boolean = isAppositive(currentMention, candidateAntecedent, cm) - val res2: Boolean = isPredicateNominative(currentMention, candidateAntecedent) - val res3: Boolean = isAcronym(currentMention, candidateAntecedent) - val res4: Boolean = isDemonym(currentMention, candidateAntecedent) - val res5: Boolean = isRelativePronoun(currentMention, candidateAntecedent) - - if (performDebug) { - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("" + ifTrueX(res1) + ifTrueX(res2) + ifTrueX(res3) + ifTrueX(res4) + ifTrueX(res5) + "") - } - - return (res1 || res2 || res3 || res4 || res5) - } - - - /** - * Returns true if the two mentions are in apposition of one another and the mentions agree on all attributes (see MentionCluster class for definition of attribute agreement). - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - def isAppositive(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val currentMentionCluster: MentionCluster = cm.getCluster(currentMention) - val candidateAntecedentCluster: MentionCluster = cm.getCluster(candidateAntecedent) - if (currentMentionCluster.agreesInAllAttributesWith(candidateAntecedentCluster)) { - return currentMention.phrase.isAppositionOf(candidateAntecedent.phrase) - } - return false - } - - - /** - * Returns true if the current mention and candidate antecedent are in a predicate nominative relationship. 
- * This is determined using the dependency tree checking that both mentions are noun phrases dependent on the verb "to be" - * @param currentMention - * @param candidateAntecedent - * @return - */ - def isPredicateNominative(currentMention: Mention, candidateAntecedent: Mention): Boolean = { - val currentMentionHead: Token = currentMention.phrase.headToken - val candidateAntecedentHead: Token = candidateAntecedent.phrase.headToken - val currentMentionSentence: Sentence = currentMention.attr[DeterministicCorefCache].mentionSentence - var predNom: Boolean = false - val currentMentionHeadParent: Token = currentMentionHead.parse.parent(currentMentionHead) - val candidateAntecedentHeadParent: Token = candidateAntecedentHead.parse.parent(candidateAntecedentHead) - val currentMentionSentenceRootChild: Token = currentMentionSentence.parse.rootChild - if (CFUtil.inTheSameSentence(currentMention, candidateAntecedent) && - (currentMentionSentenceRootChild.lemmaString == "be")) { - if (currentMentionHeadParent == null || candidateAntecedentHeadParent == null) - return false - predNom = ((currentMentionHeadParent == currentMentionSentenceRootChild) && (candidateAntecedentHeadParent == currentMentionSentenceRootChild)) - } - - return predNom - } - - - /** - * Returns true if the current mention is a relative pronoun for the candidate antecedent - * @param currentMention - * @param candidateAntecedent - * @return - */ - def isRelativePronoun(currentMention: Mention, candidateAntecedent: Mention): Boolean = { - val res: Boolean = CorefFeatures.isRelativeFor(currentMention, candidateAntecedent) - return res - - } - - /** - * Uses the MentionCharacteristics function to generate possible acronyms for both the - * current mention and candidate antecedent. If both mentions are proper nouns and one is in the list of - * possible acronyms for the other, this function returns true. 
- * @param currentMention - * @param candidateAntecedent - * @return - */ - def isAcronym(currentMention: Mention, candidateAntecedent: Mention): Boolean = { - var res: Boolean = false - if ((currentMention.attr[MentionCharacteristics].isProper) && (candidateAntecedent.attr[MentionCharacteristics].isProper)) { - - res = (currentMention.attr[MentionCharacteristics].acronym.contains(currentMention.attr[MentionCharacteristics].lowerCaseString) || candidateAntecedent.attr[MentionCharacteristics].acronym.contains(candidateAntecedent.attr[MentionCharacteristics].lowerCaseString)) - } - return res - } - - - /** - * Returns true if the current mention is a demonym of the candidate antecedent or vice versa - * @param currentMention - * @param candidateAntecedent - * @return - */ - def isDemonym(currentMention: Mention, candidateAntecedent: Mention): Boolean = { - val currentMentionString: String = CFUtil.mentionStringWithoutDeterminer(currentMention) - val candidateAntecedentString: String = CFUtil.mentionStringWithoutDeterminer(candidateAntecedent) - val currentMentionDemonym: String = lexicon.iesl.DemonymMap.getOrElse(currentMentionString, "") - val candidateAntecedentDemonym: String = lexicon.iesl.DemonymMap.getOrElse(candidateAntecedentString, "") - var res: Boolean = false - if (currentMentionDemonym.length > 0 && candidateAntecedentDemonym.length > 0) - res = (currentMentionDemonym == candidateAntecedentDemonym) - res = (res || currentMentionString.equalsIgnoreCase(candidateAntecedentDemonym) || currentMentionDemonym.equalsIgnoreCase(candidateAntecedentString)) - - return res - } -} - - -/** - * The parent class of the strict and relaxed head matching sieves. - * @param debugDirectory - */ -abstract class HeadMatchingSieve(CFUtil: CorefUtil, debugDirectory: String = "") extends Sieve(CFUtil, debugDirectory) { - override val name: String = "HeadMatchingSieve" - - override lazy val debugOutputDir: String = debugDirectory - - /** - * Returns true if there does not exist a word (excluding stopwords) which appears in one mention's cluster - * but not the other's - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - protected def satisfiesWordInclusion(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val currentMentionCluster: MentionCluster = cm.getCluster(currentMention) - val candidateAntecedentCluster: MentionCluster = cm.getCluster(candidateAntecedent) - var currentMentionWords: HashSet[String] = HashSet() - val currentMentionHeadString: String = currentMention.attr[MentionCharacteristics].lowerCaseHead - for (t <- currentMentionCluster.allTokens) { - val t_string: String = new StringOps(t.string.trim).toLowerCase.toString - if ((!CFUtil.stopWordsList.contains(t_string) && !t_string.equalsIgnoreCase(currentMentionHeadString))) { - currentMentionWords += t_string.trim - } - } - var candidateAntecedentWords: HashSet[String] = HashSet() - for (t <- candidateAntecedentCluster.allTokens) { - val t_string: String = new StringOps(t.string.trim).toLowerCase.toString - if (!CFUtil.stopWordsList.contains(t_string)) { - candidateAntecedentWords += t_string.trim - } - } - return (0 == (currentMentionWords.diff(candidateAntecedentWords).size)) - } - -} - -/** - * The standard strict head matching sieve. It is used to resolve mentions which share the same head word. 
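// Editor's sketch: the word-inclusion constraint above over plain string sets -- every non-stop
// word of the anaphor's cluster (minus its head word) must already occur in the antecedent's
// cluster:
def wordInclusionHolds(mentionClusterWords: Set[String], antecedentClusterWords: Set[String],
                       stopWords: Set[String], mentionHead: String): Boolean =
  (mentionClusterWords -- stopWords - mentionHead).subsetOf(antecedentClusterWords -- stopWords)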
- * @param debugDirectory - */ -class StrictHeadMatchingSieve(CFUtil: CorefUtil, debugDirectory: String = "") extends HeadMatchingSieve(CFUtil, debugDirectory) { - override val name: String = "StrictHeadMatchingSieve" - - // Debug settings - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "
Candidate Candidate Sent No Candidate Span Appositive Predicate Nominative Acronym Demonym Relative Pronoun
" + candidateAntecedent.phrase.string + " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + "
" - - /** - * The matching function of the standard strict head matching sieve resolves a mention to a candidate antecedent - * if the head token of the most representative element in the cluster of the current mention is an exact string match - * of a head token of one of the mentions in the candidate antecedent cluster; word inclusion between the two clusters - * holds, that is there are no non-stop words that appear in one cluster but not the other; and each mention in the two - * clusters has compatible modifiers. - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val res1: Boolean = isClusterHeadMatch(currentMention, candidateAntecedent, cm) - val res2: Boolean = satisfiesWordInclusion(currentMention, candidateAntecedent, cm) - val res3: Boolean = CFUtil.hasCompatibleModifiersOnly(cm.getCluster(currentMention), cm.getCluster(candidateAntecedent), cm) - - - if (performDebug) { - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("" + ifTrueX(res1) + ifTrueX(res2) + ifTrueX(res3) + ifTrueX(true) + "") - } - - return (res1 && res2 && res3) - - } - - - /** - * Returns true if the head token of the most representative element in the cluster of the current mention is an exact string match - * of a head token of one of the mentions in the candidate antecedent cluster. - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - protected def isClusterHeadMatch(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val mostRepresentativeMentionInCurrentCluster: Mention = cm.getCluster(currentMention).mostRepresentativeMention - val currentMentionHeadWord: String = mostRepresentativeMentionInCurrentCluster.attr[MentionCharacteristics].lowerCaseHead - val candidateAntecedentCluster: MentionCluster = cm.getCluster(candidateAntecedent) - for (mention <- candidateAntecedentCluster.mentions) { - val candidateAntecedentHeadWord: String = mention.attr[MentionCharacteristics].lowerCaseHead - if (currentMentionHeadWord.equals(candidateAntecedentHeadWord)) { - return true - } - } - return false - } - -} - -/** - * A variant of the strict head matching sieve, which removes the constraint on compatible modifiers - * @param debugDirectory (optional) Output directory for debug information - */ -class StrictHeadMatchingSieveVar1(CFUtil: CorefUtil, debugDirectory: String = "") extends StrictHeadMatchingSieve(CFUtil, debugDirectory) { - override val name: String = "StrictHeadMatchingSieve_no_CompatibleModifiers" - - // Debug settings - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "
Candidate Candidate Head Candidate Sent No Candidate Span Cluster Head Match Satisfies Word Inclusion Compatible Modifiers Only Not i-within-i
" + candidateAntecedent.phrase.string + " " + candidateAntecedent.phrase.headToken.string + " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + "
" - - /** - * The matching function for this variant of the strict head matching sieve is identical to the standard string head matching sieve - * with the constraint on compatible modifiers removed. - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val res1: Boolean = isClusterHeadMatch(currentMention, candidateAntecedent, cm) - val res2: Boolean = satisfiesWordInclusion(currentMention, candidateAntecedent, cm) - - if (performDebug) { - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("" + ifTrueX(res1) + ifTrueX(res2) + "") - } - return (res1 && res2) - - } -} - -/** - * A variant of the strict head matching sieve, which removes the constraint on word inclusion - * @param debugDirectory (optional) Output directory for debug information - */ -class StrictHeadMatchingSieveVar2(CFUtil: CorefUtil, debugDirectory: String = "") extends StrictHeadMatchingSieve(CFUtil, debugDirectory) { - override val name: String = "StrictHeadMatchingSieve_no_wordInclusion" - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "
Candidate Candidate Head Candidate Sent No Candidate Span Cluster Head Match Satisfies Word Inclusion
" + candidateAntecedent.phrase.string + " " + candidateAntecedent.phrase.headToken.string + " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + "
" - - /** - * The matching function of this variant of the strict head matching sieve is identical to the standard strict head matching sieve's matching function - * with the constraint on word inclusion removed. - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val res1: Boolean = isClusterHeadMatch(currentMention, candidateAntecedent, cm) - val res2: Boolean = CFUtil.hasCompatibleModifiersOnly(cm.getCluster(currentMention), cm.getCluster(candidateAntecedent), cm) - - if (performDebug) { - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("" + ifTrueX(res1) + ifTrueX(res2) + "") - } - return (res1 && res2) - - } -} - -/** - * The relaxed head matching sieve is a variant of the strict head matching sieve, which involves matching the head word of a mention - * to some word in an antecedent cluster. - * @param debugDirectory (Optional) Directory for debug output - */ -class RelaxedHeadMatchingSieve(CFUtil: CorefUtil, debugDirectory: String = "") extends HeadMatchingSieve(CFUtil, debugDirectory) { - override val name: String = "RelaxedHeadMatchingSieve" - - // Debug Settings - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "
Candidate Candidate Head Candidate Sent No Candidate Span Cluster Head Match Compatible Modifiers Only
" + candidateAntecedent.phrase.string + " " + candidateAntecedent.phrase.headToken.string + " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + "
" - - /** - * The matching function of the relaxed head matching sieve resolves a mention to a candidate antecedent if the head token of the most representative - * item in the current mention is an exact string match to some token in the candidate antecedent cluster and the two clusters satisfy the properties of - * word inclusion (there are no non-stop words that appear in one cluster but not the other) and the two mentions have the same NER type (excluding the type of "O"). - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val res1: Boolean = isRelaxedClusterHeadMatch(currentMention, candidateAntecedent, cm) - val res2: Boolean = satisfiesWordInclusion(currentMention, candidateAntecedent, cm) - val res3: Boolean = equalNERType(currentMention, candidateAntecedent, cm) - - if (performDebug) { - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("" + ifTrueX(res1) + ifTrueX(res2) + ifTrueX(res3) + ifTrueX(true) + "") - } - - return (res1 && res2 && res3) - - } - - - /** - * Returns true if the head token of most representative element of the current mention cluster - * matches any word in antecedent cluster - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - protected def isRelaxedClusterHeadMatch(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val currentMentionHeadWord: String = cm.getCluster(currentMention).mostRepresentativeMention.attr[MentionCharacteristics].lowerCaseHead - val candidateAntecedentCluster: MentionCluster = cm.getCluster(candidateAntecedent) - for (token <- candidateAntecedentCluster.allTokens) { - if (currentMentionHeadWord.equalsIgnoreCase(token.string.trim)) { - return true - } - } - return false - } - - /** - * Returns true if the two mentions have the same NER type and that type is not "O" - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - protected def equalNERType(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - // if both are named entities with same type - val currentMentionNER: Int = if (currentMention.phrase.attr.contains(classOf[OntonotesPhraseEntityType])) currentMention.phrase.attr[OntonotesPhraseEntityType].intValue else OntonotesEntityTypeDomain.O - val candidateAntecedentNER: Int = if (candidateAntecedent.phrase.attr.contains(classOf[OntonotesPhraseEntityType])) candidateAntecedent.phrase.attr[OntonotesPhraseEntityType].intValue else OntonotesEntityTypeDomain.O - - val res: Boolean = (currentMentionNER != OntonotesEntityTypeDomain.O && currentMentionNER == candidateAntecedentNER) - - if (res) - return true - else if (currentMentionNER == OntonotesEntityTypeDomain.PERSON && (candidateAntecedent.phrase.attr[Gender].intValue == GenderDomain.PERSON || candidateAntecedent.phrase.attr[Gender].intValue == GenderDomain.MALE || candidateAntecedent.phrase.attr[Gender].intValue == GenderDomain.FEMALE)) - return true - else if (candidateAntecedentNER == OntonotesEntityTypeDomain.PERSON && (currentMention.phrase.attr[Gender].intValue == GenderDomain.PERSON || currentMention.phrase.attr[Gender].intValue == GenderDomain.MALE || currentMention.phrase.attr[Gender].intValue == 
GenderDomain.FEMALE)) - return true - else - return false - } -} - -/** - * The sieve for resolving pronominal mentions - * @param debugDirectory - (Optional) Directory for debug output - */ -class PronounSieve(CFUtil: CorefUtil, debugDirectory: String = "") extends Sieve(CFUtil, debugDirectory) { - override val name: String = "PronounSieve" - - // Resolve only pronominal references - override val resolvePronominal: Boolean = true - override val resolveNominal: Boolean = false - - // Debug settings - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "
Candidate Candidate Sent No Candidate Span Relaxed Cluster Head Match Satisfies Word Inclusion Equal NER Type Not i-within-i
" + candidateAntecedent.phrase.string + " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + "
" - - - /** - * The matching function for the Pronoun Sieve resolves the current mention to a candidate antecedent if the two mentions' clusters agree on the attributes of person, gender, number, animacy and NER label. - * Please see the MentionCluster class' documentation for a specific definition of agreement. - * Additionally, personal pronouns have the added constraint that they must be resolved to mention which is specifically labeled as a person (i.e. either in the NER label or in the gender attribute) - * and that exactly agrees in number (i.e not unknown "wildcard" agreement). - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - if (currentMention.attr[MentionCharacteristics].isPRO) { - val currentMentionCluster: MentionCluster = cm.getCluster(currentMention) - val candidateAntecedentCluster: MentionCluster = cm.getCluster(candidateAntecedent) - - - if (performDebug) { - val attributeAgreement: Seq[Boolean] = currentMentionCluster.attributeAgreement(candidateAntecedentCluster) - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("" + ifTrueX(attributeAgreement(0)) + - ifTrueX(attributeAgreement(1)) + ifTrueX(attributeAgreement(2)) + ifTrueX(attributeAgreement(3)) + ifTrueX(attributeAgreement(4)) + - ifTrueX(personalPronounAgreement(currentMention, candidateAntecedent)) + "") - } - - return currentMentionCluster.agreesInAllAttributesWith(candidateAntecedentCluster) && personalPronounAgreement(currentMention, candidateAntecedent) - } - - return false - } - - /** - * Returns true if any of the following conditions are met: - * - The current mention is a singular personal pronoun and the antecedent is labeled as a singular person either by the NER label or gender attribute - * - The current mention is a plural personal pronoun -- in this case we do not place additional restrictions on the antecedent as the antecedents often are not labeled with PERSON NER labels or gender attributes - * - The current mention is NOT a personal pronoun and the candidate antecedent is NOT labeled as a person either by the NER label or gender attribute - */ - private def personalPronounAgreement(currentMention: Mention, candidateAntecedent: Mention): Boolean = { - val candidateAntecedentNER: Int = candidateAntecedent.attr[MentionCharacteristics].predictEntityType - val candidateAntecedentGender: Int = candidateAntecedent.attr[MentionCharacteristics].genderIndex - if (currentMention.phrase.attr[Number].intValue == NumberDomain.SINGULAR && CFUtil.personalPronouns.contains(currentMention.phrase.string.trim.toLowerCase)) - return (candidateAntecedent.phrase.attr[Number].intValue == NumberDomain.SINGULAR && (!candidateAntecedent.attr[MentionCharacteristics].isPRO || CFUtil.personalPronouns.contains(candidateAntecedent.phrase.string.trim.toLowerCase)) && (candidateAntecedentGender == GenderDomain.PERSON || candidateAntecedentGender == GenderDomain.MALE || candidateAntecedentGender == GenderDomain.FEMALE || candidateAntecedentNER == OntonotesEntityTypeDomain.PERSON)) - else if (currentMention.phrase.attr[Number].intValue == NumberDomain.PLURAL && CFUtil.personalPronouns.contains(currentMention.phrase.string.trim.toLowerCase)) - return true - else if 
(!CFUtil.personalPronouns.contains(currentMention.phrase.string.trim.toLowerCase)) - return !(CFUtil.personalPronouns.contains(candidateAntecedent.phrase.string.trim.toLowerCase) || candidateAntecedentGender == GenderDomain.PERSON || candidateAntecedentGender == GenderDomain.MALE || candidateAntecedentGender == GenderDomain.FEMALE || candidateAntecedentNER == OntonotesEntityTypeDomain.PERSON) - else - return true - } - -} - -/** - * The Relaxed String Matching sieve is used to resolve mentions which have strings that are identical up except for phrases following the head token - */ -class RelaxedStringMatchSieve(CFUtil: CorefUtil, debugDirectory: String = "") extends Sieve(CFUtil, debugDirectory) { - - override val name: String = "RelaxedStringMatch" - - // Debug settings - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "
Candidate Candidate Cluster Attributes Candidate Sent No Candidate Span Number Gender Person Animacy NER Personal Pronoun Agreement
" + candidateAntecedent.phrase.string + " " + cm.getCluster(candidateAntecedent).toStringHTML + "" + cm.getCluster(candidateAntecedent).attributeString + - " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + "
" - - - /** - * The matching function of the Relaxed String matching sieve resolves the current mention to the candidate antecedent if after removing any phrases starting - * with a comma or WDT that appear after the head token the two mention strings identically match. - * @param currentMention - * @param candidateAntecedent - * @param cm - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val currentMentionRelaxedString: String = CFUtil.relaxString(cm.getCluster(currentMention).mostRepresentativeMention) - val candidateAntecedentRelaxedString: String = CFUtil.relaxString(candidateAntecedent) - val res: Boolean = (currentMentionRelaxedString.equalsIgnoreCase(candidateAntecedentRelaxedString)) - - if (performDebug) { - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("") - if (res) debugPrint("") else debugPrintln("") - debugPrintln("") - } - - return res - } - - -} - - -/** - * A sieve for resolving proper noun mentions with similar head words - * @param debugDirectory - */ -class ProperHeadWordMatchSieve(CFUtil: CorefUtil, debugDirectory: String = "") extends HeadMatchingSieve(CFUtil, debugDirectory) { - - // Name - override val name: String = "ProperHeadWordMatch" - - // Debug settings - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "
Candidate Candidate Sent No Candidate Span Relaxed String Match Relaxed String Mention Relaxed String Candidate Antecedent
" + candidateAntecedent.phrase.string + " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + " X " + currentMentionRelaxedString + " " + candidateAntecedentRelaxedString + "
" - - - /** - * The matching function of the ProperHeadWord sieve resolves the current mention to the candidate antecedent - if both are proper nouns, have identical head tokens, and have agreement between location and numerical modifiers - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val res1: Boolean = currentMention.attr[MentionCharacteristics].isProper && candidateAntecedent.attr[MentionCharacteristics].isProper - val res2: Boolean = currentMention.phrase.headToken.string.equalsIgnoreCase(candidateAntecedent.phrase.headToken.string) - val res3: Boolean = CFUtil.agreesInLocation(currentMention, candidateAntecedent) - val res4: Boolean = CFUtil.agreementBetweenModifiersWhichAreNumbers(currentMention, candidateAntecedent) - - if (performDebug) { - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("" + - ifTrueX(res1) + ifTrueX(res2) + ifTrueX(res3) + ifTrueX(res4) + "") - } - - return res1 && res2 && res3 && res4 - } -} - - -/** - * A parent class for those sieves using Semantic Similarity - * @param debugDirectory - */ -abstract class SemanticSimilaritySieve(CFUtil: CorefUtil, debugDirectory: String = "") extends Sieve(CFUtil, debugDirectory) { - /** - * A WordNet object used by the sieves - */ - lazy val wn: WordNet = WordNet - - /** - * Essentially a wrapper method to the WordNet lemmatizer. Returns the lemma - * of a given mention string - * @param mention - * @return - */ - protected def refineMentionStringForKB(mention: Mention): String = { - val mentionString: String = mention.phrase.string.trim - return wn.lemma(mentionString, "N") - } -} - - -/** - * A sieve which resolves mentions which are aliases of one another - * @param debugDirectory - */ -class AliasSieve(CFUtil: CorefUtil, debugDirectory: String = "") extends SemanticSimilaritySieve(CFUtil, debugDirectory) { - - override val name: String = "AliasSieve" - - // Debug settings - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "
Candidate Candidate Sent No Candidate Span Both Proper Nouns Head Word String Match No Location Mismatches No Number Mismatches
" + candidateAntecedent.phrase.string + " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + "
" - - - /** - * The matching function of the Alias Sieve resolves the current mention to a candidate antecedent mention if the most representative mention in the cluster of each is a proper noun - * and either in the same synset in WordNet or labeled as aliases by the CorefFeatures.canBeAliases function. - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - val currentMentionMostRepresentative: Mention = cm.getCluster(currentMention).mostRepresentativeMention - val candidateAntecedentMostRepresentative: Mention = cm.getCluster(candidateAntecedent).mostRepresentativeMention - if (currentMentionMostRepresentative.attr[MentionCharacteristics].isProper && candidateAntecedentMostRepresentative.attr[MentionCharacteristics].isProper) { - val sameSynset: Boolean = wn.areSynonyms(refineMentionStringForKB(currentMentionMostRepresentative), refineMentionStringForKB(candidateAntecedentMostRepresentative)) - val corefFeaturesAlias: Boolean = CorefFeatures.canBeAliases(currentMention, candidateAntecedent) - if (performDebug) { - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("" + ifTrueX(sameSynset) + ifTrueX(corefFeaturesAlias) + "") - } - return sameSynset || corefFeaturesAlias - } - return false - } -} - -/** - * The Lexical Chain Sieve. A sieve that resolves mentions by using synonym paths in WordNet. - * @param debugDirectory (optional) the name of a directory to write debug output - */ -class LexicalChainSieve(CFUtil: CorefUtil, debugDirectory: String = "") extends SemanticSimilaritySieve(CFUtil, debugDirectory) { - - override val name: String = "LexicalChainSieve" - - // Debug settings - override lazy val debugOutputDir: String = debugDirectory - override val debugHTMLTableStart: String = "
Candidate Candidate Sent No Candidate Span Synonyms Coref Feature Alias
" + candidateAntecedent.phrase.string + " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + "
" - - /** - * The matching function of the Lexical Chain Sieve resolves a mention to a candidate antecedent if the sentence distance between the - * two mentions is less than 3, the clusters of both the mention and antecedent agree on all attributes, the location & numerical modifiers - * of each are consistent, and there is a path of length 4 or less between the synsets of the two mentions (including all senses of the words). - * @param currentMention - * @param candidateAntecedent - * @param cm - * @return - */ - override def matchingFunction(currentMention: Mention, candidateAntecedent: Mention, cm: MentionClusterManager): Boolean = { - // if attributes agree and mentions are less than three sentences apart, and they share a synonym, the match is ok - val currentMentionCluster: MentionCluster = cm.getCluster(currentMention) - val candidateAntecedentCluster: MentionCluster = cm.getCluster(candidateAntecedent) - if (CFUtil.sentenceDistance(currentMention, candidateAntecedent) < 3 && - currentMentionCluster.agreesInAllAttributesWith(candidateAntecedentCluster) && - CFUtil.agreesInLocation(currentMention, candidateAntecedent) && - CFUtil.agreementBetweenModifiersWhichAreNumbers(currentMention, candidateAntecedent)) { - val currentMentionSynonyms: HashSet[String] = getAllSynonyms(refineMentionStringForKB((currentMentionCluster.mostRepresentativeMention))) - val candidateAntecedentSynonyms: HashSet[String] = getAllSynonyms(refineMentionStringForKB((candidateAntecedentCluster.mostRepresentativeMention))) - val res: Boolean = (currentMentionSynonyms.intersect(candidateAntecedentSynonyms).size > 0) - - if (performDebug) { - val start: Int = candidateAntecedent.phrase.start - candidateAntecedent.phrase.sentence.start - val end: Int = candidateAntecedent.phrase.end - candidateAntecedent.phrase.sentence.start - debugPrint("" + ifTrueX(res) + "") - } - return res - } - return false - } - - /** - * Traverses the synsets of the word in WordNet gathering all possible synonyms (from all senses of the word) - * up to levelsDeep synsets - * @param mentionLemma - lemma used as input to wordnet - * @param levelsDeep - number of synset layers to traverse - * @return - */ - private def getAllSynonyms(mentionLemma: String, levelsDeep: Int = 4): HashSet[String] = { - var allSynonyms: HashSet[String] = HashSet[String]() - var nextLemmas: HashSet[String] = HashSet[String](mentionLemma) - var nextSynsets: Seq[Synset] = Seq[Synset]() - for (i <- 0 to levelsDeep by 1) { - for (lemma <- nextLemmas) { - allSynonyms += lemma - nextSynsets = nextSynsets ++ wn.synsets(lemma) - } - nextLemmas = HashSet[String]() - for (syn <- nextSynsets) { - nextLemmas += syn.id - } - } - return allSynonyms - } -} - - -/** - * A management system of the mention clusters used by the deterministic coreference system - * @param mentions - */ -class MentionClusterManager(mentions: Seq[Mention], CFUtil: CorefUtil) { - - // A mapping from the uniqueId of the mentions to their integer cluster id number - private var _mention2clusterid: HashMap[String, Int] = HashMap[String, Int]() - // A mapping from the integer cluster id number to the corresponding cluster - private var _clusterid2cluster: HashMap[Int, MentionCluster] = HashMap[Int, MentionCluster]() - - /* Initialize the clusters for each of the mentions */ - var count: Int = 0 - for (mention <- mentions) { - _mention2clusterid += (mention.uniqueId -> count) - _clusterid2cluster += (count -> new MentionCluster(CFUtil)) - _clusterid2cluster(count).addMention(mention) - count += 1 - } - - /** 
- * Merges the currentMention's cluster with the candidateAntecedent's cluster - * that is it adds every element of the currentMention's cluster to the candidateAntecedent's cluster - * and destroys the old cluster. - * @param currentMention - * @param candidateAntecedent - */ - def mergeClusters(currentMention: Mention, candidateAntecedent: Mention): Unit = { - val cmClusterId: Int = getClusterId(currentMention) - val cmCluster: MentionCluster = getCluster(currentMention) - for (mention <- cmCluster.mentions) { - setCluster(mention, candidateAntecedent) - } - _clusterid2cluster.remove(cmClusterId) - } - - - /** - * Returns the cluster of the given mention - * @param mention - a mention from the document - * @return the MentionCluster of the mention - */ - def getCluster(mention: Mention): MentionCluster = { - return _clusterid2cluster(_mention2clusterid(mention.uniqueId)) - } - - /** - * Returns true if the given mention is the mention in its cluster which appears - * earliest in the document. - * @param mention - * @return - */ - def isFirstInCluster(mention: Mention): Boolean = { - val mentionsCluster: MentionCluster = this.getCluster(mention) - return (mention eq mentionsCluster.firstMention) - } - - /** - * Returns an HTML formatted string of the cluster assignments. - * @return - */ - def toHTMLString: String = { - var s: String = "


Current Cluster Assignments

" - for (paircidcluster <- _clusterid2cluster) { - if (paircidcluster._2.mentions.size > 0) { - s += "\n
Candidate Candidate Sent No Candidate Span Have WordNet Path (max length 4)
" + candidateAntecedent.phrase.string + " " + candidateAntecedent.phrase.sentence.indexInSection + " " + start + "-" + end + "
\n " - for (mention <- paircidcluster._2.mentions) { - val start: Int = mention.phrase.start - mention.phrase.sentence.start - val end: Int = mention.phrase.end - mention.phrase.sentence.start - s += "\n " - } - s += "\n
Mention Sentence Mention Span Mention
" + mention.phrase.sentence.indexInSection + " " + start + "-" + end + " " + mention.phrase.string + "




" - } - } - return s - } - - - /** - * Sets the cluster id of mention to the passed in value - * @param mention - * @param new_cluster_id - */ - private def setCluster(mention: Mention, new_cluster_id: Int): Unit = { - _clusterid2cluster(_mention2clusterid(mention.uniqueId)).removeMention(mention) - _clusterid2cluster(new_cluster_id).addMention(mention) - _mention2clusterid.update(mention.uniqueId, new_cluster_id) - } - - /** - * Sets the cluster id of mention1 to be that of mention2, that is - * it places mention1 in mention2's cluster. - * @param mention1 - * @param mention2 - */ - private def setCluster(mention1: Mention, mention2: Mention): Unit = { - setCluster(mention1, _mention2clusterid(mention2.uniqueId)) - } - - /** - * Returns the cluster ID number of the given mention - * @param mention - * @return - */ - private def getClusterId(mention: Mention): Int = { - return _mention2clusterid(mention.uniqueId) - } -} - - - - -/** - * A representation of the intermediary clusters of mentions used by the deterministic coreference system. - * The clusters maintain information about various attributes of their mentions. - */ -class MentionCluster (CFUtil: CorefUtil) { - /* - * An internal data structure used for management of the attributes of the cluster - */ - private object MentionClusterAttribute extends Enumeration { - type MentionClusterAttribute = Value - val NUMBER, GENDER, PERSON, ANIMACY, NER = Value - } - - - /* - * A mapping from the uniqueId field of the mentions to the mentions themselves. - * Note that we could not just keep a HashSet of the mentions, because hashing - * on the Mention objects themselves will cause problems - */ - private val _mentionMap: HashMap[String,Mention] = HashMap[String,Mention]() - - /* - * A Seq of all of the mentions in the cluster - */ - private var _mentions: Seq[Mention] = Seq[Mention]() - - /* - * A Seq of all of the tokens of the mentions in the cluster - */ - private var _allTokens: Seq[Token] = Seq[Token]() - - /* - * A hashmap from each of the MentionCluster attributes to the HashSet which - * stores the values of the attributes for each mention in the cluster - */ - private val _attributes: HashMap[MentionClusterAttribute.Value, HashSet[Int]] = HashMap[MentionClusterAttribute.Value, HashSet[Int]]() - // Initialize the _attribute structure - for (at <- MentionClusterAttribute.values) { - _attributes.put(at, HashSet[Int]()) - } - - /* - * The mention in the cluster which appears first in the document - */ - private var _firstMention: Mention = null - - /* - * The "most representative" mention in the cluster, see method definition - * for explanation. 
- */ - private var _mostRepresentativeMention: Mention = null - - - /** - * Returns an formatted string of the mentions in the cluster, in the form - * "{:(,):}" - */ - override def toString: String = { - var s: String = "{ " - for (m <- _mentions) { - s += m.phrase.sentence.indexInSection + "(" + m.phrase.start +"," + m.phrase.end + "):" + m.phrase.string + " " - } - s += "}" - return s - } - - /** - * Returns an HTML formatted string of the mentions in the cluster - */ - def toStringHTML: String = { - var s: String = "{ " - for (m <- _mentions) { - s += m.phrase.sentence.indexInSection + "(" + m.phrase.start +"," + m.phrase.end + "):" + m.phrase.string + " " - } - s += "}" - return s - } - - /** - * Returns a Seq of all of the mentions in the cluster - */ - def mentions: Seq[Mention] = { - _mentions - } - - /** - * Adds a mention to the cluster - * @param mention - the mention to add - */ - def addMention(mention: Mention): Unit = { - - // Update the mention Map - _mentionMap.put(mention.uniqueId, mention) - - // Update _mentions - _mentions = _mentionMap.values.toSeq.sortBy(m => (m.attr[DeterministicCorefCache].absoluteSentenceNumber, m.phrase.start, m.phrase.end)) - - // Update first mention - _firstMention = _mentions(0) - - // Update most representative mention - if (_mostRepresentativeMention == null) { - _mostRepresentativeMention = mention - } else { - for (mention <- _mentionMap.values) { - if (mention.uniqueId != _mostRepresentativeMention.uniqueId) - _mostRepresentativeMention = CFUtil.moreRepresentativeOf(_mostRepresentativeMention, mention) - } - } - // _mostRepresentativeMention = CorefUtil.moreRepresentativeOf(_mostRepresentativeMention, mention) - // Update each set of attributes & _allTokens - _attributes(MentionClusterAttribute.NUMBER).add(mention.phrase.attr[Number].intValue) - _attributes(MentionClusterAttribute.GENDER).add(mention.phrase.attr[Gender].intValue) - _attributes(MentionClusterAttribute.PERSON).add(CFUtil.getPerson(mention)) - _attributes(MentionClusterAttribute.ANIMACY).add(CFUtil.getAnimacy(mention)) - if (mention.phrase.attr.contains(classOf[OntonotesPhraseEntityType])) { - _attributes(MentionClusterAttribute.NER).add(mention.phrase.attr[OntonotesPhraseEntityType].intValue) - } else { - _attributes(MentionClusterAttribute.NER).add(OntonotesEntityTypeDomain.O) - } - - - _allTokens = _allTokens ++ mention.phrase.tokens - } - - - /** - * Removes a mention from the cluster - * @param mention - The mention to remove - */ - def removeMention(mention: Mention): Unit = { - _mentionMap.remove(mention.uniqueId) - } - - /** - * Returns the mention in the cluster which appears first in the document - */ - def firstMention: Mention = { - _firstMention - } - - /** - * Returns a Seq of all of the tokens of the mentions in the cluster - */ - def allTokens: Seq[Token] = { - _allTokens - } - - /** - * Returns a HashSet of the String representation of the gender attributes of the mentions in the cluster - */ - def numberAttributes: HashSet[Int] = { - _attributes(MentionClusterAttribute.NUMBER) - } - - /** - * Returns a HashSet of the String representation of the gender attributes of the mentions in the cluster - */ - def genderAttributes: HashSet[Int] = { - _attributes(MentionClusterAttribute.GENDER) - } - - /** - * Returns a HashSet of the String representation of the person attributes of the mentions in the cluster - */ - def personAttributes: HashSet[Int] = { - _attributes(MentionClusterAttribute.PERSON) - } - - - /** - * Returns a HashSet of the String representation 
of the animacy attributes of the mentions in the cluster - */ - def animacyAttributes: HashSet[Int] = { - _attributes(MentionClusterAttribute.ANIMACY) - } - - /** - * Returns a HashSet of the String representation of the NER labels of the mentions in the cluster - */ - def nerAttributes: HashSet[Int] = { - _attributes(MentionClusterAttribute.NER) - } - - /** - * Returns the "most representative mention" in the cluster. - * The representativeness of a mention is determined as a total - * ordering of mentions such that: all proper nouns are more representative - * than common nouns, and all common nouns more representative than pronouns. - * Mentions with the same part of speech are ordered first by - * their distance Start of the sentence (smaller distance is better), the section - * of a mention (lower index is better), the sentence position in section (lower index is better) - * head position in Sentence (earlier is better), the length of mention (if length < 5, shorter length is better, other longer length is better) - */ - def mostRepresentativeMention: Mention = { - _mostRepresentativeMention - } - -/** - * Returns true if this at least one of this cluster's mentions has the same - * number attribute as one of the other cluster's mentions or if - * a mention in either cluster has unknown number. - * @param otherCluster - */ - def agreesInNumberWith(otherCluster: MentionCluster): Boolean = { - val thisMentionNumbers: HashSet[Int] = this.numberAttributes - val otherMentionNumbers: HashSet[Int] = otherCluster.numberAttributes - - if (thisMentionNumbers.contains(NumberDomain.UNKNOWN) || otherMentionNumbers.contains(NumberDomain.UNKNOWN)) - return true - val thisSETDIFFotherISEMPTY: Boolean = (thisMentionNumbers.diff(otherMentionNumbers).size == 0) - val otherSETDIFFthisISEMPTY: Boolean = (otherMentionNumbers.diff(thisMentionNumbers).size == 0) - return (thisSETDIFFotherISEMPTY || otherSETDIFFthisISEMPTY) - } - - - - /** - * Returns true if any of the following conditions hold: at least one of this cluster's - * mentions has the same gender attribute as one of the other cluster's mentions; - * a mention in either cluster has unknown gender; - * or a mention in one cluster has gender "PERSON" and a mention in the other cluster - * has a gender of either male or female. - * @param otherCluster - */ - def agreesInGenderWith(otherCluster: MentionCluster): Boolean = { - val thisMentionGenders: HashSet[Int] = this.genderAttributes - val otherMentionGenders: HashSet[Int] = otherCluster.genderAttributes - - if (thisMentionGenders.contains(GenderDomain.UNKNOWN) || otherMentionGenders.contains(GenderDomain.UNKNOWN)) - return true - - // Handle case where one contains PERSON and the other contains either male or female. - if (thisMentionGenders.contains(GenderDomain.PERSON) && (otherMentionGenders.contains(GenderDomain.MALE) || otherMentionGenders.contains(GenderDomain.FEMALE))) - return true - if (otherMentionGenders.contains(GenderDomain.PERSON) && (thisMentionGenders.contains(GenderDomain.MALE) || thisMentionGenders.contains(GenderDomain.FEMALE))) - return true - - val thisSETDIFFotherISEMPTY: Boolean = (thisMentionGenders.diff(otherMentionGenders).size == 0) - val otherSETDIFFthisISEMPTY: Boolean = (otherMentionGenders.diff(thisMentionGenders).size == 0) - return (thisSETDIFFotherISEMPTY || otherSETDIFFthisISEMPTY) - } - - -/** - * Returns true if this at least one of this cluster's mentions has the same - * person attribute as one of the other cluster's mentions. 
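The total ordering described in the comment above (proper nouns over common nouns over pronouns, ties broken by position and by the length heuristic) can be written as a plain Ordering. This is a hedged illustration over a hypothetical MentionInfo stand-in, and the tie-break tuple only approximates the criteria listed above; it is not the CorefUtil.moreRepresentativeOf implementation used by the deleted code.

// Sketch of the representativeness ordering described above.
object RepresentativenessSketch {

  final case class MentionInfo(isProper: Boolean, isPronoun: Boolean,
                               sentenceIndex: Int, startInSentence: Int, length: Int)

  // Lower rank is more representative: proper noun, then common noun, then pronoun.
  private def posRank(m: MentionInfo): Int =
    if (m.isProper) 0 else if (m.isPronoun) 2 else 1

  // Mentions shorter than 5 tokens prefer shorter; longer ones prefer longer,
  // mirroring the length heuristic in the comment above.
  private def lengthRank(m: MentionInfo): Int =
    if (m.length < 5) m.length else -m.length

  // Earlier start in the sentence and earlier sentence break remaining ties.
  val moreRepresentativeFirst: Ordering[MentionInfo] =
    Ordering.by((m: MentionInfo) => (posRank(m), m.startInSentence, m.sentenceIndex, lengthRank(m)))

  def mostRepresentative(ms: Seq[MentionInfo]): MentionInfo =
    ms.min(moreRepresentativeFirst)
}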
- * @param otherCluster - */ - def agreesInPersonWith(otherCluster: MentionCluster): Boolean = { - val thisMentionPerson: HashSet[Int] = this.personAttributes - val otherMentionPerson: HashSet[Int] = otherCluster.personAttributes - val thisSETDIFFotherISEMPTY: Boolean = (thisMentionPerson.diff(otherMentionPerson).size == 0) - val otherSETDIFFthisISEMPTY: Boolean = (otherMentionPerson.diff(thisMentionPerson).size == 0) - return (thisSETDIFFotherISEMPTY || otherSETDIFFthisISEMPTY) - } - - - - /** - * Returns true if this at least one of this cluster's mentions has the same - * animacy attribute as one of the other cluster's mentions or if - * a mention in either cluster has unknown animacy. - * @param otherCluster - */ - def agreesInAnimacyWith(otherCluster: MentionCluster): Boolean = { - val thisMentionAnimacies: HashSet[Int] = this.animacyAttributes - val otherMentionAnimacies: HashSet[Int] = otherCluster.animacyAttributes - - if (thisMentionAnimacies.contains(DCorefAnimacyDomain.UNKNOWN) || otherMentionAnimacies.contains(DCorefAnimacyDomain.UNKNOWN)) - return true - - val thisSETDIFFotherISEMPTY: Boolean = (thisMentionAnimacies.diff(otherMentionAnimacies).size == 0) - val otherSETDIFFthisISEMPTY: Boolean = (otherMentionAnimacies.diff(thisMentionAnimacies).size == 0) - - return (thisSETDIFFotherISEMPTY || otherSETDIFFthisISEMPTY) - - } - - - /** - * Returns true if this cluster's mentions and the other cluster's mentions - * have at least one NER label in common or if one of the cluster's mentions - * have an NER label of "O" or "MISC" - * @param otherCluster - */ - def agreesInNERLabelsWith(otherCluster: MentionCluster): Boolean = { - val thisMentionNER: HashSet[Int] = this.nerAttributes - val otherMentionNER: HashSet[Int] = otherCluster.nerAttributes - if (thisMentionNER.contains(OntonotesEntityTypeDomain.O) || thisMentionNER.contains(OntonotesEntityTypeDomain.MISC) || otherMentionNER.contains(OntonotesEntityTypeDomain.O) || otherMentionNER.contains(OntonotesEntityTypeDomain.MISC)) - return true - val thisSETDIFFotherISEMPTY: Boolean = (thisMentionNER.diff(otherMentionNER).size == 0) - val otherSETDIFFthisISEMPTY: Boolean = (otherMentionNER.diff(thisMentionNER).size == 0) - - return (thisSETDIFFotherISEMPTY || otherSETDIFFthisISEMPTY) - - } - - - /** - * Returns a string representation of the cluster's attributes. - * of the form "{ || || || || }" - */ - def attributeString: String = { - val res: String = "{" + this.numberAttributes.toString + " || " + this.genderAttributes.toString + " || " + this.personAttributes.toString + " || " + this.animacyAttributes.toString + " || " + this.nerAttributes.toString + "}" - return res.replaceAll("Set", "") - } - - - /** - * Returns true only if this cluster agrees with the passed in cluster in - * the attributes of number, gender, person, animacy, and NER labels. - * Please refer to the individual agreement methods for explanation of - * the definition of agreement in these attributes. 
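All of the per-attribute checks above share one pattern: two clusters agree on an attribute when either side contains the unknown value (or, for NER, O/MISC), or when one side's observed value set is a subset of the other's (the empty set-difference test above); agreesInAllAttributesWith is then just the conjunction of the five checks. A compact, hedged sketch of that pattern over plain Int value sets:

// Set-based agreement pattern used by the agreesIn*With methods above.
object AttributeAgreementSketch {

  // Hypothetical wildcard value standing in for NumberDomain.UNKNOWN,
  // GenderDomain.UNKNOWN, etc.
  val Unknown = -1

  // Agreement holds when either side is unknown, or when one side's observed
  // values are a subset of the other's.
  def agrees(a: Set[Int], b: Set[Int]): Boolean =
    a.contains(Unknown) || b.contains(Unknown) || a.subsetOf(b) || b.subsetOf(a)

  // agreesInAllAttributesWith is a conjunction over the attributes.
  def agreesInAll(a: Map[String, Set[Int]], b: Map[String, Set[Int]]): Boolean =
    a.keySet.forall(k => agrees(a(k), b.getOrElse(k, Set(Unknown))))
}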
- * @param otherCluster
- */
- def agreesInAllAttributesWith(otherCluster: MentionCluster): Boolean = {
- val res1: Boolean = this.agreesInNumberWith(otherCluster)
- val res2: Boolean = this.agreesInGenderWith(otherCluster)
- val res3: Boolean = this.agreesInPersonWith(otherCluster)
- val res4: Boolean = this.agreesInAnimacyWith(otherCluster)
- val res5: Boolean = this.agreesInNERLabelsWith(otherCluster)
- return (res1 && res2 && res3 && res4 && res5)
- }
-
- /**
- * Returns a Seq of 5 booleans, such that the positions in the Seq correspond to
- * agreement with otherCluster in the attributes in the following order: [number, gender, person, animacy, nerlabel]
- * @param otherCluster
- */
- def attributeAgreement(otherCluster: MentionCluster): Seq[Boolean] = {
- val res1: Boolean = this.agreesInNumberWith(otherCluster)
- val res2: Boolean = this.agreesInGenderWith(otherCluster)
- val res3: Boolean = this.agreesInPersonWith(otherCluster)
- val res4: Boolean = this.agreesInAnimacyWith(otherCluster)
- val res5: Boolean = this.agreesInNERLabelsWith(otherCluster)
- return Seq[Boolean](res1, res2, res3, res4, res5)
- }
-
-}
diff --git a/src/main/scala/cc/factorie/app/nlp/coref/DeterministicNamedCoref.scala b/src/main/scala/cc/factorie/app/nlp/coref/DeterministicNamedCoref.scala
deleted file mode 100644
index 2d02e7e..0000000
--- a/src/main/scala/cc/factorie/app/nlp/coref/DeterministicNamedCoref.scala
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
- This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
- http://factorie.cs.umass.edu, http://github.com/factorie
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-package cc.factorie.app.nlp.coref
-
-import cc.factorie.app.nlp._
-import cc.factorie.app.nlp.pos.PennPosTag
-
-/** A dead-simple deterministic coreference system that operates only on named entities
- and resolves coreference only by exact string match.
*/ -object DeterministicNamedCoref extends DeterministicNamedCoref(ConllPhraseFinder) - -class DeterministicNamedCoref(phraseFinder:MentionPhraseFinder) extends DocumentAnnotator { - def prereqAttrs: Seq[Class[_]] = phraseFinder.prereqAttrs ++ Seq(classOf[PennPosTag]) - def postAttrs = Seq(classOf[WithinDocCoref]) - def tokenAnnotationString(token: Token): String = { - val entities = token.document.coref.entities.toSeq - token.document.coref.mentions.find(m => m.phrase.contains(token)) match { - case Some(mention) => - val mtokens = mention.phrase.tokens - if (mtokens.length == 1) "(" + entities.indexOf(mention.entity) + ")" - else if (mtokens.indexOf(token) == 0) "(" + entities.indexOf(mention.entity) - else if (mtokens.indexOf(token) == mtokens.length) entities.indexOf(mention.entity) + ")" - else "_" - case None => "_" - } - } - def process(document: Document) = { - val phrases = phraseFinder(document) - val coref = new WithinDocCoref(document) - for (phrase <- phrases) { - val targetString = phrase.tokensString(" ") - // Find an entity whose canonical mention is an exact string match - val entityOption = coref.entities.find(_.canonicalMention.string == targetString) - if (entityOption.isDefined) coref.addMention(phrase, entityOption.get) - else {val entity = coref.newEntity(); val mention = coref.addMention(phrase, entity); entity.canonicalMention = mention} - } - document.attr += coref - if (!document.annotators.contains(classOf[WithinDocCoref])) - document.annotators(classOf[WithinDocCoref]) = this.getClass - document - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/ForwardCoref.scala b/src/main/scala/cc/factorie/app/nlp/coref/ForwardCoref.scala index 4db92b8..75b589f 100644 --- a/src/main/scala/cc/factorie/app/nlp/coref/ForwardCoref.scala +++ b/src/main/scala/cc/factorie/app/nlp/coref/ForwardCoref.scala @@ -1,426 +1,7 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - package cc.factorie.app.nlp.coref - -import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} -import java.io._ -import java.util.concurrent.ExecutorService - -import cc.factorie.app.nlp.phrase._ -import cc.factorie.app.nlp.pos.PennPosTag -import cc.factorie.app.nlp.wordnet.WordNet -import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} -import cc.factorie.optimize._ -import cc.factorie.util._ - -import scala.collection.mutable.ArrayBuffer - -/**Forward Coreference on Proper Noun, Pronoun and Common Noun Mentions*/ -class ParseForwardCoref extends ForwardCoref { - override def prereqAttrs: Seq[Class[_]] = ParseAndNerBasedPhraseFinder.prereqAttrs.toSeq ++ ForwardCoref.prereqAttrs - override def annotateMentions(document:Document): Unit = { - if(document.coref.mentions.isEmpty) ParseAndNerBasedPhraseFinder.getPhrases(document).foreach(document.coref.addMention) - document.coref.mentions.foreach(mention => NounPhraseEntityTypeLabeler.process(mention.phrase)) - document.coref.mentions.foreach(mention => NounPhraseGenderLabeler.process(mention.phrase)) - document.coref.mentions.foreach(mention => NounPhraseNumberLabeler.process(mention.phrase)) - } -} - -object ParseForwardCoref extends ParseForwardCoref { - deserialize(new DataInputStream(ClasspathURL[ParseForwardCoref](".factorie").openConnection().getInputStream)) -} - -/** Forward Coreference on Ner and Pronoun Mentions*/ -class NerForwardCoref extends ForwardCoref { - override def prereqAttrs: Seq[Class[_]] = (ConllPhraseFinder.prereqAttrs ++ AcronymNounPhraseFinder.prereqAttrs++PronounFinder.prereqAttrs ++ NnpPosNounPhraseFinder.prereqAttrs ++ ForwardCoref.prereqAttrs).distinct - override def annotateMentions(document:Document): Unit = { - if(document.coref.mentions.isEmpty) (ConllPhraseFinder(document) ++ PronounFinder(document) ++ NnpPosNounPhraseFinder(document)++ AcronymNounPhraseFinder(document)).distinct.foreach(phrase => document.getCoref.addMention(phrase)) - document.coref.mentions.foreach(mention => NounPhraseEntityTypeLabeler.process(mention.phrase)) - document.coref.mentions.foreach(mention => NounPhraseGenderLabeler.process(mention.phrase)) - document.coref.mentions.foreach(mention => NounPhraseNumberLabeler.process(mention.phrase)) - } -} - -object NerForwardCoref extends NerForwardCoref { - deserialize(new DataInputStream(ClasspathURL[NerForwardCoref](".factorie").openConnection().getInputStream)) -} - class ForwardCoref extends ForwardCorefBase { val model = new BaseCorefModel } object ForwardCoref extends ForwardCoref - -class ForwardCorefImplicitConjunctions extends ForwardCorefBase { - val model = new ImplicitCrossProductCorefModel -} - -abstract class ForwardCorefBase extends CorefSystem[Seq[MentionPairLabel]] { - val options = new CorefOptions - val model:PairwiseCorefModel - - - /**Store head words which are seen over a default 20 times in the model - * @param trainDocs Documents to generate counts from*/ - def preprocessCorpus(trainDocs:Seq[Document]) = { - val nonPronouns = trainDocs.flatMap(_.targetCoref.mentions.filterNot(m => m.phrase.isPronoun)) - model.CorefTokenFrequencies.counter = new TopTokenFrequencies(TokenFreqs.countWordTypes(nonPronouns,(t) => t.phrase.headToken.string.toLowerCase,20)) - } - - def instantiateModel(optimizer:GradientOptimizer,pool:ExecutorService) = new LeftRightParallelTrainer(optimizer,pool) - - /**Generate the labels used for training - * @param coref This is expected to be the true coreference class for the document - * @return Sequence of training labels 
for this document*/ - def getCorefStructure(coref:WithinDocCoref): Seq[MentionPairLabel] = { - val mentions = coref.mentions.sortBy(m=>m.phrase.start) - assertSorted(mentions) - val labels = new ArrayBuffer[MentionPairLabel] - for (i <- 0 until mentions.size){ - if(!options.usePronounRules || !mentions(i).phrase.isPronoun) - labels ++= generateTrainingLabelsForOneAnaphor(mentions, i) - } - labels - } - - /** - * Given the index of a mention, create positive and negative labels for this mention and its prodecessors - * @param orderedMentions Mentions for this document - * @param anaphorIndex Index of current mention to generate labels for - * @return Training Labels for this Mention */ - protected def generateTrainingLabelsForOneAnaphor(orderedMentions: Seq[Mention], anaphorIndex: Int): Seq[MentionPairLabel] = { - val labels = new ArrayBuffer[MentionPairLabel] - val m1 = orderedMentions(anaphorIndex) - var numAntecedents = 0 - var i = anaphorIndex - 1 - while (i >= 0 && (numAntecedents < options.numPositivePairsTrain || !options.pruneNegTrain)) { - val m2 = orderedMentions(i) - val label = m1.entity != null & m1.entity == m2.entity - if (!pruneMentionPairTraining(m1,m2,label,numAntecedents)) { - val cl = new MentionPairLabel(model, m1, m2, orderedMentions, label, options=options) - if(label) numAntecedents += 1 - labels += cl - } - i -= 1 - } - labels - } - case class MentionPairLabelFeatures(label: MentionPairLabel,features: MentionPairFeatures) - - /** Given a sequence of MentionPairLabels for a document, compute features of the pair and return both*/ - protected def generateFeatures(labels: Seq[MentionPairLabel]): Seq[MentionPairLabelFeatures] = { - val previousLabels = new ArrayBuffer[MentionPairLabelFeatures]() - labels.foreach{ label => - val candidateLabelFeatures = label.genFeatures() - //If we want to merge features of our antecedent with any of it's previous mentions, - if(options.mergeFeaturesAtAll && label.mention2.entity != null){ - val matchingPreviousLabelsFeatures = previousLabels.lastIndexWhere(l => l.label.mention2.entity == label.mention2.entity) - if(matchingPreviousLabelsFeatures != -1) mergeFeatures(candidateLabelFeatures, previousLabels(matchingPreviousLabelsFeatures).features) - } - previousLabels += new MentionPairLabelFeatures(label,candidateLabelFeatures) - } - previousLabels - } - - class LeftRightParallelTrainer(optimizer: GradientOptimizer, pool: ExecutorService, miniBatchSize: Int = 1) extends ParallelTrainer(optimizer,pool){ - def map(in: Seq[MentionPairLabel]): Seq[Example] = { - // |**("Adding Features for Labels") - val examples = MiniBatchExample(miniBatchSize,generateFeatures(in).map{trainingInstance => model.getExample(trainingInstance.label,trainingInstance.features,options.slackRescale)}) - // **| - examples - } - } - - def mergeFeatures(l: MentionPairFeatures, mergeables: MentionPairFeatures) { - if (options.mergeFeaturesAtAll) { - assert(l.features.activeCategories.forall(!_.startsWith("NBR"))) - val mergeLeft = ArrayBuffer[MentionPairFeatures]() - l.features ++= mergeables.features.mergeableAllFeatures.map("NBRR_" + _) - } - } - - /**Types of Pairs Pruned during Training - * - cataphora since we do not corefer these - * - Any pair of mentions which overlap each other*/ - def pruneMentionPairTraining(anaphor: Mention,antecedent: Mention,label: Boolean,numAntecedents: Int): Boolean = { - val cataphora = antecedent.phrase.isPronoun && !anaphor.phrase.isPronoun - if(cataphora) { - if (label && !options.allowPosCataphora || !label && 
!options.allowNegCataphora) { - return true - } - } - if(!anaphor.phrase.tokens.intersect(antecedent.phrase.tokens).isEmpty) return true - if (label && numAntecedents > 0 && !options.pruneNegTrain) return true - return false - } - def pruneMentionPairTesting(anaphor: Mention,antecedent: Mention): Boolean = { - val cataphora = antecedent.phrase.isPronoun && !anaphor.phrase.isPronoun - if(options.usePronounRules && antecedent.phrase.isPronoun) return true - else if(cataphora || options.allowTestCataphora) return true - if(!anaphor.phrase.tokens.intersect(antecedent.phrase.tokens).isEmpty) return true - return false - } - - /**Find each mentions best scoring antecedent. If the antecedent has a cluster add the new mention if not, create a new entity and add both mentions - * Currently does not create singleton entities - * @param coref Expects nontarget coref class that is pre annotated with mentions - * @return - */ - def infer(coref: WithinDocCoref): WithinDocCoref = { - val mentions = coref.mentions.sortBy(m => m.phrase.start) - for (i <- 0 until coref.mentions.size) { - val m1 = mentions(i) - val bestCand = getBestCandidate(coref,mentions, i) - if (bestCand != null) { - if(bestCand.entity ne null){ - bestCand.entity += m1 - } - else{ - val entity = coref.newEntity(); entity += bestCand; entity += m1 - } - }else {val entity = coref.newEntity(); entity += m1} - } - coref - } - - def getBestCandidate(coref: WithinDocCoref, mentions: Seq[Mention], mInt: Int): Mention = { - val candidateLabels = ArrayBuffer[MentionPairFeatures]() - var bestCandidate: Mention = null - var bestScore = Double.MinValue - var anteIdx = mInt - val m1 = mentions(mInt) - var numPositivePairs = 0 - while (anteIdx >= 0 && (numPositivePairs < options.numPositivePairsTest || !options.pruneNegTest)) { - val m2 = mentions(anteIdx) - if (!pruneMentionPairTesting(m1,m2)) { - val candidateLabel = new MentionPairFeatures(model, m1, m2, mentions, options=options) - val mergeables = candidateLabels.lastIndexWhere(l => l.mention2.entity != null &&l.mention2.entity == candidateLabel.mention2.entity) - if(mergeables != -1) mergeFeatures(candidateLabel, candidateLabels(mergeables)) - candidateLabels += candidateLabel - val score = if (m1.phrase.isProperNoun && m1.attr[MentionCharacteristics].nounWords.forall(m2.attr[MentionCharacteristics].nounWords.contains) - && m2.attr[MentionCharacteristics].nounWords.forall(m1.attr[MentionCharacteristics].nounWords.contains) - || options.mergeMentionWithApposition && (m1.phrase.isAppositionOf(m2.phrase) - || m2.phrase.isAppositionOf(m1.phrase))) Double.PositiveInfinity - else model.predict(candidateLabel.value) - if (score > 0.0) { - numPositivePairs += 1 - if (bestScore <= score) { - bestCandidate = m2 - bestScore = score - } - } - } - anteIdx -= 1 - } - bestCandidate - } -} - - - - -/**Base class for any coreference system - * @tparam CoreferenceStructure The type used as a training instance, ex. 
MentionPairLabel or MentionGraph, - * In the examples above, the training instance is either one pair or the whole document respectively*/ -abstract class CorefSystem[CoreferenceStructure] extends DocumentAnnotator with Trackable{ - val model:CorefModel - val options:CorefOptions - def prereqAttrs: Seq[Class[_]] = Seq(classOf[Token],classOf[PennPosTag]) - def postAttrs = Seq(classOf[WithinDocCoref]) - def tokenAnnotationString(token:Token): String = { - val entities = token.document.coref.entities.toSeq - var outputString = token.document.coref.mentions.filter(mention => mention.phrase.contains(token)) match { - case ms:Seq[Mention] if ms.length > 0 => ms.filter(m => m.entity != null && !m.entity.isSingleton).map{ - m => if (m.phrase.length == 1) "("+entities.indexOf(m.entity)+")" - else if(m.phrase.indexOf(token) == 0) "("+entities.indexOf(m.entity) - else if(m.phrase.indexOf(token) == m.phrase.length - 1) entities.indexOf(m.entity)+")" - else "" - }.mkString("|") - case _ => "_" - } - if(outputString == "") outputString = "_" - else if(outputString.endsWith("|")) outputString = outputString.substring(0,outputString.length-1) - "%15s".format(outputString) - } - - def process(document: Document) = { - document.annotators += classOf[WithinDocCoref] -> this.getClass - if(document.getCoref.mentions.isEmpty) - annotateMentions(document) - infer(document.getCoref) - document - } - - def annotateMentions(document: Document): Unit = { - if(options.useGoldBoundaries){ - assert(document.targetCoref ne null,"Gold Boundaries cannot be used without gold data.") - document.targetCoref.mentions.foreach{m => - if(options.useEntityType){ - val newMention = document.getCoref.addMention(new Phrase(m.phrase.value.chain,m.phrase.start,m.phrase.length,m.phrase.headTokenOffset)) - newMention.phrase.attr += m.phrase.attr[OntonotesPhraseEntityType] - newMention.phrase.attr += m.phrase.attr[NounPhraseType] - } - else { - val newMention = document.getCoref.addMention(new Phrase(m.phrase.value.chain,m.phrase.start,m.phrase.length,m.phrase.headTokenOffset)) - NounPhraseEntityTypeLabeler.process(newMention.phrase) - newMention.phrase.attr += m.phrase.attr[NounPhraseType] - } - } - NounPhraseGenderLabeler.process(document) - MentionPhraseNumberLabeler.process(document) - } - } - - /**Perform any preprocessing such as getting top used words - * @param trainDocs Documents to generate counts from */ - def preprocessCorpus(trainDocs: Seq[Document]): Unit - - /**Returns training labels for data in the format that should be used for training - * @param coref Gold Coref to be used for training */ - def getCorefStructure(coref: WithinDocCoref): CoreferenceStructure - def instantiateModel(optimizer: GradientOptimizer,pool: ExecutorService): ParallelTrainer - def infer(doc: WithinDocCoref): WithinDocCoref - - abstract class ParallelTrainer(optimizer: GradientOptimizer, val pool: ExecutorService) { - def map(in: CoreferenceStructure): Seq[Example] - def reduce(states: Iterable[Seq[Example]]) { - for (examples <- states) { - val trainer = new OnlineTrainer(model.parameters, optimizer, maxIterations = 1, logEveryN = examples.length - 1) - trainer.trainFromExamples(examples) - } - } - def runParallel(ins: Seq[CoreferenceStructure]){ - reduce(cc.factorie.util.Threading.parMap(ins, pool)(map)) - } - def runSequential(ins: Seq[CoreferenceStructure]){ - reduce(ins.map(map)) - } - } - - - // todo fix this - @deprecated("This exists to preserve prior behavior, it should be a constructor argument", "10/5/15") - val lexicon = new 
StaticLexicons()(LexiconsProvider.classpath()) - - def train(trainDocs: Seq[Document], testDocs: Seq[Document], wn: WordNet, rng: scala.util.Random, saveModelBetweenEpochs: Boolean,saveFrequency: Int,filename: String, learningRate: Double = 1.0): Double = { - val optimizer = if (options.useAverageIterate) new AdaGrad(learningRate) with ParameterAveraging else if (options.useAdaGradRDA) new AdaGradRDA(rate = learningRate,l1 = options.l1) else new AdaGrad(rate = learningRate) - for(doc <- trainDocs; mention <- doc.targetCoref.mentions) mention.attr += new MentionCharacteristics(mention, lexicon) - preprocessCorpus(trainDocs) - |**("Training Structure Generated") - var i = 0 - val trainingFormat: Seq[CoreferenceStructure] = trainDocs.map{doc => i +=1 ; if(i % 100 == 0) println("Processing Labels for: " + i + " of " + trainDocs.size); getCorefStructure(doc.targetCoref)} - **| - val pool = java.util.concurrent.Executors.newFixedThreadPool(options.numThreads) - var accuracy = 0.0 - try { - val trainer = instantiateModel(optimizer, pool) - for (iter <- 0 until options.numTrainingIterations) { - val shuffledDocs = rng.shuffle(trainingFormat) - val batches = shuffledDocs.grouped(options.featureComputationsPerThread*options.numThreads).toSeq - for ((batch, b) <- batches.zipWithIndex) { - if (options.numThreads > 1) trainer.runParallel(batch) - else trainer.runSequential(batch) - } - if (!model.MentionPairFeaturesDomain.dimensionDomain.frozen) model.MentionPairFeaturesDomain.dimensionDomain.freeze() - if (!options.useAdaGradRDA && options.useAverageIterate) optimizer match {case o: ParameterAveraging => o.setWeightsToAverage(model.parameters) } - println("Train docs") - doTest(trainDocs.take((trainDocs.length*options.trainPortionForTest).toInt), wn, "Train") - println("Test docs") - |**("Running Test") - accuracy = doTest(testDocs, wn, "Test") - **|("End Test") - if(saveModelBetweenEpochs && iter % saveFrequency == 0) - serialize(filename + "-" + iter) - if (!options.useAdaGradRDA && options.useAverageIterate) optimizer match {case o: ParameterAveraging => o.unSetWeightsToAverage(model.parameters) } - } - if (!options.useAdaGradRDA&& options.useAverageIterate) optimizer match {case o: ParameterAveraging => o.setWeightsToAverage(model.parameters) } - accuracy - } finally { - pool.shutdown() - } - } - - class CorefTester(scorer: CorefConllOutput, scorerMutex: Object, val pool: ExecutorService){ - def map(doc: Document): Unit = { - assert(doc.targetCoref ne null,"Cannot perform test on document without test key.") - val trueCoref = doc.targetCoref - val predCoref = doc.coref - - predCoref.resetPredictedMapping() - for(mention <- predCoref.mentions) if(mention.attr[MentionCharacteristics] eq null) mention.attr += new MentionCharacteristics(mention, lexicon) - - infer(predCoref) - - val b3 = ClusterF1Evaluation.BCubedNoSingletons(predCoref, trueCoref) - val ce = ClusterF1Evaluation.CeafE(predCoref,trueCoref) - val muc = ClusterF1Evaluation.MUCNoSingletons(predCoref, trueCoref) - val cm = ClusterF1Evaluation.CeafM(predCoref,trueCoref) - - scorerMutex.synchronized { - scorer.microB3.microAppend(b3) - scorer.microCE.microAppend(ce) - scorer.microCM.microAppend(cm) - scorer.microMUC.microAppend(muc) - } - } - def runParallel(ins: Seq[Document]) = cc.factorie.util.Threading.parMap(ins, pool)(map) - def runSequential(ins: Seq[(Document)]) = ins.map(map) - } - - def doTest(testDocs: Seq[Document], wn: WordNet, name: String): Double = { - val scorer = new CorefConllOutput - object ScorerMutex - val pool = 
java.util.concurrent.Executors.newFixedThreadPool(options.numThreads) - var accuracy = 0.0 - try { - val tester = new CorefTester(scorer, ScorerMutex, pool) - tester.runParallel(testDocs) - println("-----------------------") - println(" * Overall scores") - scorer.printInhouseScore(name) - accuracy = scorer.microMUC.f1 - } finally pool.shutdown() - accuracy - } - - def assertSorted(mentions: Seq[Mention]): Unit = { - for(i <- 0 until mentions.length -1) - assert(mentions(i).phrase.tokens.head.stringStart <= mentions(i+1).phrase.tokens.head.stringStart, "the mentions are not sorted by their position in the document. Error at position " +i+ " of " + mentions.length) - } - - def deserialize(stream: DataInputStream) { - val config = options.getConfigHash - BinarySerializer.deserialize(config, stream) - options.setConfigHash(config) - println("deserializing with config:\n" + options.getConfigHash.iterator.map(x => x._1 + " = " + x._2).mkString("\n")) - model.deserialize(stream) - model.MentionPairFeaturesDomain.dimensionDomain.freeze() - println("model weights 1norm = " + model.parameters.oneNorm) - stream.close() - } - - def deserialize(filename: String) { - deserialize(new DataInputStream(new FileInputStream(filename))) - } - - def serialize(filename: String) { - println("serializing with config:\n" + options.getConfigHash.iterator.map(x => x._1 + " = " + x._2).mkString("\n")) - val stream = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(new File(filename)))) - BinarySerializer.serialize(options.getConfigHash, stream) - model.serialize(stream) - println("model weights 1norm = " + model.parameters.oneNorm) - stream.close() - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/ForwardCorefBase.scala b/src/main/scala/cc/factorie/app/nlp/coref/ForwardCorefBase.scala new file mode 100644 index 0000000..59b3c97 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/ForwardCorefBase.scala @@ -0,0 +1,176 @@ +package cc.factorie.app.nlp.coref + +import java.util.concurrent.ExecutorService + +import cc.factorie.app.nlp.Document +import cc.factorie.optimize.{Example, GradientOptimizer, MiniBatchExample} + +import scala.collection.mutable.ArrayBuffer + +abstract class ForwardCorefBase extends CorefSystem[Seq[MentionPairLabel]] { + val options = new CorefOptions + val model:PairwiseCorefModel + + + /**Store head words which are seen over a default 20 times in the model + * @param trainDocs Documents to generate counts from*/ + def preprocessCorpus(trainDocs:Seq[Document]) = { + val nonPronouns = trainDocs.flatMap(_.targetCoref.mentions.filterNot(m => m.phrase.isPronoun)) + model.CorefTokenFrequencies.counter = new TopTokenFrequencies(TokenFreqs.countWordTypes(nonPronouns,(t) => t.phrase.headToken.string.toLowerCase,20)) + } + + def instantiateModel(optimizer:GradientOptimizer,pool:ExecutorService) = new LeftRightParallelTrainer(optimizer,pool) + + /**Generate the labels used for training + * @param coref This is expected to be the true coreference class for the document + * @return Sequence of training labels for this document*/ + def getCorefStructure(coref:WithinDocCoref): Seq[MentionPairLabel] = { + val mentions = coref.mentions.sortBy(m=>m.phrase.start) + assertSorted(mentions) + val labels = new ArrayBuffer[MentionPairLabel] + for (i <- 0 until mentions.size){ + if(!options.usePronounRules || !mentions(i).phrase.isPronoun) + labels ++= generateTrainingLabelsForOneAnaphor(mentions, i) + } + labels + } + + //copy from coref tester + private def assertSorted(mentions: 
Seq[Mention]): Unit = { + for(i <- 0 until mentions.length -1) + assert(mentions(i).phrase.tokens.head.stringStart <= mentions(i+1).phrase.tokens.head.stringStart, "the mentions are not sorted by their position in the document. Error at position " +i+ " of " + mentions.length) + } + + /** + * Given the index of a mention, create positive and negative labels for this mention and its prodecessors + * @param orderedMentions Mentions for this document + * @param anaphorIndex Index of current mention to generate labels for + * @return Training Labels for this Mention */ + protected def generateTrainingLabelsForOneAnaphor(orderedMentions: Seq[Mention], anaphorIndex: Int): Seq[MentionPairLabel] = { + val labels = new ArrayBuffer[MentionPairLabel] + val m1 = orderedMentions(anaphorIndex) + var numAntecedents = 0 + var i = anaphorIndex - 1 + while (i >= 0 && (numAntecedents < options.numPositivePairsTrain || !options.pruneNegTrain)) { + val m2 = orderedMentions(i) + val label = m1.entity != null & m1.entity == m2.entity + if (!pruneMentionPairTraining(m1,m2,label,numAntecedents)) { + val cl = new MentionPairLabel(model, m1, m2, orderedMentions, label, options=options) + if(label) numAntecedents += 1 + labels += cl + } + i -= 1 + } + labels + } + case class MentionPairLabelFeatures(label: MentionPairLabel,features: MentionPairFeatures) + + /** Given a sequence of MentionPairLabels for a document, compute features of the pair and return both*/ + protected def generateFeatures(labels: Seq[MentionPairLabel]): Seq[MentionPairLabelFeatures] = { + val previousLabels = new ArrayBuffer[MentionPairLabelFeatures]() + labels.foreach{ label => + val candidateLabelFeatures = label.genFeatures() + //If we want to merge features of our antecedent with any of it's previous mentions, + if(options.mergeFeaturesAtAll && label.mention2.entity != null){ + val matchingPreviousLabelsFeatures = previousLabels.lastIndexWhere(l => l.label.mention2.entity == label.mention2.entity) + if(matchingPreviousLabelsFeatures != -1) mergeFeatures(candidateLabelFeatures, previousLabels(matchingPreviousLabelsFeatures).features) + } + previousLabels += new MentionPairLabelFeatures(label,candidateLabelFeatures) + } + previousLabels + } + + class LeftRightParallelTrainer(optimizer: GradientOptimizer, pool: ExecutorService, miniBatchSize: Int = 1) extends ParallelTrainer(optimizer,pool){ + def map(in: Seq[MentionPairLabel]): Seq[Example] = { + // |**("Adding Features for Labels") + val examples = MiniBatchExample(miniBatchSize,generateFeatures(in).map{trainingInstance => model.getExample(trainingInstance.label,trainingInstance.features,options.slackRescale)}) + // **| + examples + } + } + + def mergeFeatures(l: MentionPairFeatures, mergeables: MentionPairFeatures) { + if (options.mergeFeaturesAtAll) { + assert(l.features.activeCategories.forall(!_.startsWith("NBR"))) + val mergeLeft = ArrayBuffer[MentionPairFeatures]() + l.features ++= mergeables.features.mergeableAllFeatures.map("NBRR_" + _) + } + } + + /**Types of Pairs Pruned during Training + * - cataphora since we do not corefer these + * - Any pair of mentions which overlap each other*/ + def pruneMentionPairTraining(anaphor: Mention,antecedent: Mention,label: Boolean,numAntecedents: Int): Boolean = { + val cataphora = antecedent.phrase.isPronoun && !anaphor.phrase.isPronoun + if(cataphora) { + if (label && !options.allowPosCataphora || !label && !options.allowNegCataphora) { + return true + } + } + if(!anaphor.phrase.tokens.intersect(antecedent.phrase.tokens).isEmpty) return true + if 
(label && numAntecedents > 0 && !options.pruneNegTrain) return true + return false + } + def pruneMentionPairTesting(anaphor: Mention,antecedent: Mention): Boolean = { + val cataphora = antecedent.phrase.isPronoun && !anaphor.phrase.isPronoun + if(options.usePronounRules && antecedent.phrase.isPronoun) return true + else if(cataphora || options.allowTestCataphora) return true + if(!anaphor.phrase.tokens.intersect(antecedent.phrase.tokens).isEmpty) return true + return false + } + + /**Find each mention's best scoring antecedent. If the antecedent already has an entity, add the new mention to it; if not, create a new entity and add both mentions. + * Currently does not create singleton entities + * @param coref Expects a non-target coref class that is pre-annotated with mentions + * @return + */ + def infer(coref: WithinDocCoref): WithinDocCoref = { + val mentions = coref.mentions.sortBy(m => m.phrase.start) + for (i <- 0 until coref.mentions.size) { + val m1 = mentions(i) + val bestCand = getBestCandidate(coref,mentions, i) + if (bestCand != null) { + if(bestCand.entity ne null){ + bestCand.entity += m1 + } + else{ + val entity = coref.newEntity(); entity += bestCand; entity += m1 + } + }else {val entity = coref.newEntity(); entity += m1} + } + coref + } + + def getBestCandidate(coref: WithinDocCoref, mentions: Seq[Mention], mInt: Int): Mention = { + val candidateLabels = ArrayBuffer[MentionPairFeatures]() + var bestCandidate: Mention = null + var bestScore = Double.MinValue + var anteIdx = mInt + val m1 = mentions(mInt) + var numPositivePairs = 0 + while (anteIdx >= 0 && (numPositivePairs < options.numPositivePairsTest || !options.pruneNegTest)) { + val m2 = mentions(anteIdx) + if (!pruneMentionPairTesting(m1,m2)) { + val candidateLabel = new MentionPairFeatures(model, m1, m2, mentions, options=options) + val mergeables = candidateLabels.lastIndexWhere(l => l.mention2.entity != null &&l.mention2.entity == candidateLabel.mention2.entity) + if(mergeables != -1) mergeFeatures(candidateLabel, candidateLabels(mergeables)) + candidateLabels += candidateLabel + val score = if (m1.phrase.isProperNoun && m1.attr[MentionCharacteristics].nounWords.forall(m2.attr[MentionCharacteristics].nounWords.contains) + && m2.attr[MentionCharacteristics].nounWords.forall(m1.attr[MentionCharacteristics].nounWords.contains) + || options.mergeMentionWithApposition && (m1.phrase.isAppositionOf(m2.phrase) + || m2.phrase.isAppositionOf(m1.phrase))) Double.PositiveInfinity + else model.predict(candidateLabel.value) + if (score > 0.0) { + numPositivePairs += 1 + if (bestScore <= score) { + bestCandidate = m2 + bestScore = score + } + } + } + anteIdx -= 1 + } + bestCandidate + } +} + diff --git a/src/main/scala/cc/factorie/app/nlp/coref/ForwardCorefTrainerOpts.scala b/src/main/scala/cc/factorie/app/nlp/coref/ForwardCorefTrainerOpts.scala new file mode 100644 index 0000000..135f6a8 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/ForwardCorefTrainerOpts.scala @@ -0,0 +1,30 @@ +package cc.factorie.app.nlp.coref + +/** + * Created by andrew@andrewresearch.net on 28/10/17.
+ */ + +/** Trainers for Coreference Systems*/ +trait ForwardCorefTrainerOpts extends CorefTrainerOpts{ + val numPositivePairsTrain = new CmdOption("prune-train", 2, "INT", "number of positive pairs before pruning instances in training") + val numPositivePairsTest = new CmdOption("prune-test", 100, "INT", "number of positive pairs before pruning instances in testing") + val numThreads = new CmdOption("num-threads", 4, "INT", "Number of threads to use") + val featureComputationsPerThread = new CmdOption("feature-computations-per-thread", 2, "INT", "Number of feature computations per thread to run in parallel during training") + val numTrainingIterations = new CmdOption("num-training-iterations", 4, "INT", "Number of passes through the training data") + val useMIRA = new CmdOption("use-mira", false, "BOOLEAN", "Whether to use MIRA as an optimizer") + val saveFrequency = new CmdOption("save-frequency", 1, "INT", "how often to save the model between epochs") + val trainPortionForTest = new CmdOption("train-portion-for-test", 0.1, "DOUBLE", "When testing on train, what portion to use.") + val mergeFeaturesAtAll = new CmdOption("merge-features-at-all", true, "BOOLEAN", "Whether to merge features") + val conjunctionStyle = new CmdOption("conjunction-style", "NONE", "NONE|HASH|SLOW", "What types of conjunction features to use - options are NONE, HASH, and SLOW (use slow string-based conjunctions).") + val entityLR = new CmdOption("entity-left-right",false,"BOOLEAN","whether to do entity-based pruning in lr search") + val slackRescale = new CmdOption("slack-rescale",2.0,"FLOAT","recall bias for hinge loss") + val useEntityType = new CmdOption("use-entity-type",true,"BOOLEAN","whether to use entity type info") + val mergeAppositions = new CmdOption("merge-appositions",false,"BOOLEAN","whether to merge appositions as a rule") + val usePronounRules = new CmdOption("use-pronoun-rules",false,"BOOLEAN","whether to do deterministic assigning of pronouns and not consider pronouns for training") + val trainSeparatePronounWeights = new CmdOption("separate-pronoun-weights",true,"BOOLEAN","train a separate weight vector for pronoun-pronoun comparison") + val numCompareToTheLeft = new CmdOption("num-compare-to-the-left",75,"INT","number of mentions to compare to the left before backing off to only looking at non-pronouns and those in entities (only used if entityLR == true)") + val learningRate = new CmdOption("learning-rate",1.0,"FLOAT","learning rate") + val serialize = new CmdOption("serialize", "ForwardCoref.factorie", "FILE", "Filename in which to serialize classifier.") + val deserialize = new CmdOption("deserialize", "", "FILE", "Filename from which to deserialize classifier.") + val useAverageIterate = new CmdOption("use-average-iterate", true, "BOOLEAN", "Use the average iterate instead of the last iterate?") +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/ImplicitFeatureConjunctionTensor.scala b/src/main/scala/cc/factorie/app/nlp/coref/ImplicitFeatureConjunctionTensor.scala deleted file mode 100644 index 95eb2fa..0000000 --- a/src/main/scala/cc/factorie/app/nlp/coref/ImplicitFeatureConjunctionTensor.scala +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
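A minimal usage sketch, not part of this patch: it assumes CorefTrainerOpts ultimately mixes in cc.factorie.util.DefaultCmdOptions (as in upstream FACTORIE), so the CmdOption fields above can be filled from command-line arguments via parse and read back via .value. The object name and flag values below are illustrative only.

object ForwardCorefTrainerOptsSketch {
  // Hypothetical concrete holder; ForwardCorefTrainerOpts itself is only a trait.
  object opts extends ForwardCorefTrainerOpts
  def main(args: Array[String]): Unit = {
    opts.parse(args)                                    // e.g. --num-threads=8 --use-mira=true
    val threads = opts.numThreads.value                 // Int, default 4
    val iterations = opts.numTrainingIterations.value   // Int, default 4
    println(s"training with $threads threads for $iterations iterations")
  }
}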
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.coref - -import cc.factorie.la._ -import cc.factorie.util.{DoubleSeq, SparseDoubleSeq} - -/** - * User: apassos - * Date: 6/27/13 - * Time: 12:21 PM - */ - -/** - * HashConjunctionFeatureTensor is a tensor which implicitly represents all - * conjunctions of features in its baseFeatures member. It never instantiates - * all the conjunctions in memory, and it uses hashing for efficiency. - * @param dim1 - the size of the hash domain for the conjunction features - * @param baseFeatures - the sparse binary tensor from which conjunctions are being computed - */ -class ImplicitFeatureConjunctionTensor(val dim1: Int, val baseFeatures: SparseBinaryTensor, domain: ImplicitDomain) extends Tensor1 with ReadOnlyTensor with SparseDoubleSeq { - def activeDomainSize = baseFeatures.activeDomainSize*baseFeatures.activeDomainSize - def isDense = false - private val _dim1 = dim1 - private val _a = domain.a - private val _b = domain.b - private val _p0 = domain.prime0 - - @inline private def prodIndex(i: Int, j: Int): Int = ((i * dim1 + j) * _a + _b)%_p0 - - @inline private def index(i: Int, j: Int) = { - val res = prodIndex(i, j) % _dim1 - if (res < 0) _dim1 + res else res - } - @inline private def sign(i: Int, j: Int): Int = - 1 - 2 * (prodIndex(i, j) & 1) - - def dot(ds: DoubleSeq) = ds match { - case t: DenseTensor => - val len = baseFeatures.activeDomainSize - val indices = baseFeatures._indices - val arr = t.asArray - var i = 0 - var dot = 0.0 - while (i < len) { - var j = 0 - val ii = indices(i) - while (j < i) { - val ij = indices(j) - dot += arr(index(ii, ij)) * sign(ii, ij) - j += 1 - } - i += 1 - } - dot - case t: Tensor => - val len = baseFeatures.activeDomainSize - val indices = baseFeatures._indices - var i = 0 - var dot = 0.0 - while (i < len) { - var j = 0 - val ii = indices(i) - while (j < i) { - val ij = indices(j) - dot += t(index(ii, ij)) * sign(ii, ij) - j += 1 - } - i += 1 - } - dot - } - def activeDomain = throw new Error("Can't efficiently enumerate the active domain") - def apply(i: Int) = throw new Error("Can't efficiently access a value in a given position") - - /** - * Note: this foreachActiveElement might call the same index twice. 
- */ - override def foreachActiveElement(f: (Int, Double) => Unit) { - val len = baseFeatures.activeDomainSize - val indices = baseFeatures._indices - var i = 0 - while (i < len) { - var j = 0 - val ii = indices(i) - while (j < i) { - val ij = indices(j) - f(index(ii, ij), sign(ii, ij)) - j += 1 - } - i += 1 - } - } -} - -class ImplicitDomain(baseSize: Int) { - private lazy val dimSize = baseSize - lazy val prime0 = PrimeUtils.getRandomPrime(2*dimSize, 10*dimSize, new java.util.Random(0)) - lazy val a = new java.util.Random(0).nextInt(prime0) - lazy val b = new java.util.Random(1).nextInt(prime0) -} - -object PrimeUtils { - def getRandomPrime(start: Int, end: Int, rand: java.util.Random): Int = { - // println("start: " + start + " end: " + end ) - while (true) { - val candidate = (start + rand.nextInt(end - start)) | 1 - if (isPrime(candidate)) return candidate - } - sys.error("impossible") - } - def isPrime(n: Int): Boolean = { - require(n > 0) - val upto = math.sqrt(n).asInstanceOf[Int] + 1 - var i = 2 - while (i <= upto) { - if (n % i == 0) return false - i += 1 - } - true - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/Mention.scala b/src/main/scala/cc/factorie/app/nlp/coref/Mention.scala index d40db57..1f90921 100644 --- a/src/main/scala/cc/factorie/app/nlp/coref/Mention.scala +++ b/src/main/scala/cc/factorie/app/nlp/coref/Mention.scala @@ -1,61 +1,7 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ package cc.factorie.app.nlp.coref -import cc.factorie._ -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.phrase._ -import cc.factorie.app.nlp.pos.PennPosDomain -import cc.factorie.util.{Attr, EvaluatableClustering, ImmutableArrayIndexedSeq, UniqueId} -import cc.factorie.variable._ +import cc.factorie.app.nlp.phrase.Phrase -import scala.collection.mutable.ArrayBuffer - -/** Either a mention, entity or sub-entity in an coreference or entity resolution model. - These are the "nodes" in a trees in which observed mentions are the leaves and inferred entities are the roots. - In "hierarchical coreference" there may be additional nodes at intermediate levels of the tree. - @author Andrew McCallum */ -trait Node extends UniqueId with Attr { - type ParentType <: Node - /** A pointer to the Node immediate above this Node in the tree. */ - def parent: ParentType -} - -/** A "mention" of an entity in a resolution problem. - A leaf in a coreference hierarchy. - This is the super-trait for mentions in both within-document coreference and cross-document entity resolution. - @author Andrew McCallum */ -trait AbstractMention extends Node { - def parent: ParentType - /** The root of the coreference tree in which this mention is a leaf. */ - def entity: ParentType - /** A string representation of the observed mention, e.g. "Michael Smith". 
*/ - def string: String -} - -/** An "entity" in an entity resolution problem. - A non-leaf Node in a coreference hierarchy. - It could be a root (entity) or an intermediate node (sub-entity in hierarchical coref). - This is the super-trait for entities in both within-document coreference and cross-document entity resolution. - @author Andrew McCallum */ -trait AbstractEntity extends Node { - def children: Iterable[Node] // Immediate children - def childIds: Iterable[String] = children.map(_.uniqueId) - def mentions: Iterable[AbstractMention] // Leaves of tree -} - - -// Below is infrastructure for within-document coreference // TODO Turn this into a trait. Only concrete will be an inner class of WithinDocCoref /** An entity mention whose contents come from a nlp.phrase.Phrase. @@ -70,214 +16,4 @@ abstract class Mention(val phrase:Phrase) extends AbstractMention { def parent: ParentType = _entity lazy val string = phrase.tokensString(" ") // If number, gender and entity type are needed, put a CategoricalVariable subclass in the Attr -} - -// TODO All three of these classes should be removed. -akm -/** A collection of Mentions, either immutable or mutable. */ -trait MentionCollection extends Iterable[Mention] -/** An immutable ordered collection of Mentions. */ -class MentionList(mentions:Iterable[Mention]) extends ImmutableArrayIndexedSeq(mentions) with MentionCollection -/** An mutable ordered collection of Mentions. */ -class MentionBuffer extends ArrayBuffer[Mention] with MentionCollection - - - -/** An entity whose evidence comes from some Phrases within a single document. - Users should not create these themselves, but rather use WithinDocCoref create them. - The uniqueId is abstract. - @author Andrew McCallum */ -abstract class WithinDocEntity(val document:Document) extends AbstractEntity { - type ParentType = WithinDocEntity - private val _mentions = new scala.collection.mutable.LinkedHashSet[Mention] - def parent: WithinDocEntity = null - def mentions:scala.collection.Set[Mention] = _mentions - def isSingleton:Boolean = _mentions.size == 1 - def isEmpty:Boolean = _mentions.isEmpty - def children: Iterable[Mention] = _mentions - // TODO Rename this to remove the "get". - def getFirstMention: Mention = if(isEmpty) null else if(isSingleton) _mentions.head else mentions.minBy(m => m.phrase.start) - def +=(mention:Mention): Unit = { - assert(mention.phrase.document eq document) - //assert(!_mentions.contains(mention)) // No reason to do this; might catch a bug. - if (mention.entity ne null) mention.entity._mentions -= mention - if(!_mentions.contains(mention))_mentions += mention - mention._setEntity(WithinDocEntity.this) - } - def -=(mention:Mention): Unit = { - assert(mention.phrase.document eq document) - assert(_mentions.contains(mention)) // No reason to do this; might catch a bug. - assert(mention.entity == this) - _mentions -= mention - mention._setEntity(null) - } - - /** Return the canonical mention for the entity cluster. 
If the canonical mention is not already set it computes, sets, and returns the canonical mention */ - def getCanonicalMention: Mention = { - if (canonicalMention eq null) { - val canonicalOption = _mentions.filter{m => - (m.phrase.attr[NounPhraseType].value == NounPhraseTypeDomain.value("NOM") || - m.phrase.attr[NounPhraseType].value == NounPhraseTypeDomain.value("NAM")) && - m.phrase.last.posTag.intValue != PennPosDomain.posIndex - }.toSeq.sortBy(m => (m.phrase.start, m.phrase.length)).headOption - canonicalMention = canonicalOption.getOrElse(children.headOption.orNull) - canonicalName = canonicalMention.string - } - canonicalMention - } - var canonicalName: String = null - var canonicalMention: Mention = null - // If number, gender and entity type are needed, put a CategoricalVariable subclass in the Attr -} - - - -/** Container for a within-document coreference solution, typically stored as an attr of the Document. - Some may contain an imperfect inferred coref solution; others may store a gold-standard target coref solution. - Concrete instances of Mention and WithinDocEntity are created here. - @author Andrew McCallum - */ -class WithinDocCoref(val document:Document) extends EvaluatableClustering[WithinDocEntity,Phrase#Value] { - /** When we have labeled gold-standard truth for coref, it is stored here. */ - var target: WithinDocCoref = null // ...the alternative would have been to create different subclasses of WithinDocCoref so they could be stored separately in the Document.attr, but I chose this as cleaner. -akm - /** A mapping from (the Phrase's span value) to Mention */ - private val _spanToMention = new scala.collection.mutable.LinkedHashMap[Span[Section,Token],Mention] - //private val _phraseToMention = new scala.collection.mutable.LinkedHashMap[Phrase,Mention] // Used to index by this instead. I think we can remove this now. -akm - /** A mapping from entity.uniqueId to WithinDocEntity */ - private val _entities = new scala.collection.mutable.LinkedHashMap[String,WithinDocEntity] - /** A mapping from entity key (i.e. an Int identifying the true entity) to the entity.uniqueId */ - private lazy val _entityKeyToId = new scala.collection.mutable.HashMap[Int,String] - private var _entityCount = 0 // The number of WithinDocEntities ever created here. This number never goes down. - /** A string that will be used as a prefix on the uniqueIds of the Mentions and WithinDocEntities created here. */ - def uniqueId: String = document.uniqueId // TODO Perhaps this should be something more safely unique if we save more than one WithinDocCoref objects per Document? -akm - def uniqueIdEntitySuffix(entityIndex:Int): String = "//WithinDocEntity" + entityIndex - def uniqueIdMentionSuffix(phraseStart:Int, phraseLength:Int): String = "//Mention(" + phraseStart + "," + phraseLength + ")" - /** Concrete implementation of WithinDocEntity that automatically stores itself in WithinDocCoref.entities. */ - protected class WithinDocEntity1(val uniqueId:String) extends WithinDocEntity(document) { - def this() = this(WithinDocCoref.this.uniqueId + uniqueIdEntitySuffix(_entityCount)) // TODO Is this what we want? -akm - _entityCount += 1 - assert(!_entities.contains(uniqueId)) - _entities(uniqueId) = this - def coref: WithinDocCoref = WithinDocCoref.this - } - /** Concrete implementation of Mention that automatically stores itself in WithinDocCoref.mentions. 
*/ - protected class Mention1(phrase:Phrase, entity:WithinDocEntity) extends Mention(phrase) { - def this(phrase:Phrase, entityKey:Int) = this(phrase, entityFromKey(entityKey)) // Typically used for labeled data - def this(phrase:Phrase, entityUniqueId:String) = this(phrase, entityFromUniqueId(entityUniqueId)) // Typically used for deserialization - def this(phrase:Phrase) = this(phrase, null.asInstanceOf[WithinDocEntity]) // Typically used for new inference // TODO Should this be null, or a newly created blank Entity; See LoadConll2011 also. - assert(entity == null || entity.asInstanceOf[WithinDocEntity1].coref == WithinDocCoref.this) - _spanToMention(phrase.value) = this - val uniqueId = WithinDocCoref.this.uniqueId + uniqueIdMentionSuffix(phrase.start, phrase.length) // TODO Is this what we want? -akm - if (entity ne null) entity += this - def coref: WithinDocCoref = WithinDocCoref.this - } - - /** Given Span (typically the value of a Phrase), return the corresponding Mention. - Note that Span is a case class, so the lookup is done by the span's boundaries, not by its identity. */ - def mention(span:Span[Section,Token]): Option[Mention] = _spanToMention.get(span) - /** Return the Mention corresponding to the given Phrase. If none present, return null. - Note that since the lookup happens by the Phrase's Span value, the returned mention.phrase may be different than this method's argument. */ - def mention(phrase:Phrase): Option[Mention] = _spanToMention.get(phrase.value) - - /** Create a new Mention whose entity will be null. */ - def addMention(phrase:Phrase): Mention = _spanToMention.getOrElse(phrase.value, new Mention1(phrase)) - /** Create a new Mention with entity specified by given uniqueId. */ - def addMention(phrase:Phrase, entityId:String): Mention = { assert(!_spanToMention.contains(phrase.value)); new Mention1(phrase, entityId) } - /** Create a new Mention with entity specified by given key. */ - def addMention(phrase:Phrase, entityKey:Int): Mention = { assert(!_spanToMention.contains(phrase.value)); new Mention1(phrase, entityKey) } - /** Create a new Mention with the given entity, which must also be in this WithinDocCoref */ - def addMention(phrase:Phrase, entity:WithinDocEntity): Mention = new Mention1(phrase, entity) - - /** Remove a Mention from this coreference solution, and from its entity if it has one. */ - def deleteMention(mention:Mention): Unit = { - if (mention.entity ne null) mention.entity -= mention - _spanToMention.remove(mention.phrase.value) - } - - /** Checks whether the given tokenspan overlaps with an existing mention, returns the overlapping mention if it does. */ - def findOverlapping(tokenSpan:TokenSpan):Option[Mention] = tokenSpan match { - case ts if ts.document == this.document => mentions.find(_.phrase.characterOffsets overlapsWith ts.characterOffsets) - case _ => None - } - - /** Return all Mentions in this coreference solution. */ - def mentions: Seq[Mention] = _spanToMention.values.toVector - /** Return a collection of WithinDocEntities managed by this coref solution. Note that some of them may have no Mentions. */ - def entities: Iterable[WithinDocEntity] = _entities.values - /** Create and return a new WithinDocEntity with uniqueId determined by the number entities created so far. */ - def newEntity(): WithinDocEntity = new WithinDocEntity1() - /** Return the entity associated with the given uniqueId, or create a new entity if not found already among 'entities'. 
*/ - def entityFromUniqueId(id:String): WithinDocEntity = _entities.getOrElse(id, new WithinDocEntity1(id)) - /** Return the entity associated with the given key, or create a new entity if not found already among 'entities'. */ - def entityFromKey(key:Int): WithinDocEntity = { - val id = _entityKeyToId.getOrElse(key,null) - val result = if (id eq null) new WithinDocEntity1 else _entities(id) - _entityKeyToId(key) = result.uniqueId - result - } - /** Return the entity associated with the given uniqueId. Return null if not found. */ - def idToEntity(id:String): WithinDocEntity = _entities(id) - /** Remove from the list of entities all entities that contain no mentions. */ - def trimEmptyEntities(): Unit = _entities.values.filter(_.mentions.size == 0).map(_.uniqueId).foreach(_entities.remove) // TODO But note that this doesn't purge _entityKeyToId; perhaps it should. - /** Remove from all entities and mentions associated with entities that contain only one mention. */ - def removeSingletons():Unit ={ - _entities.values.filter(_.mentions.size == 1).map(_.uniqueId).foreach{ - id => - _entities(id).mentions.foreach(m => deleteMention(m)) - _entities.remove(id) - } - } - - /**Reset the clustered entities for this coref solution without losing mentions and their cached properties*/ - def resetPredictedMapping():Unit = {_entities.clear();mentions.foreach(_._setEntity(null));_entityCount = 0 } - - // Support for evaluation - // These assure we ignore any singletons for conll scoring - // TODO: Allow for ACE scoring where singletons are counted - def clusterIds: Iterable[WithinDocEntity] = _entities.values.filterNot(_.isSingleton) - def pointIds: Iterable[Phrase#Value] = _spanToMention.values.filterNot(m => m.entity == null || m.entity.isSingleton).map(_.phrase.value) - def pointIds(entityId:WithinDocEntity): Iterable[Phrase#Value] = if(!entityId.isSingleton) entityId.mentions.map(_.phrase.value) else Seq() - def intersectionSize(entityId1:WithinDocEntity, entityId2:WithinDocEntity): Int = if(!entityId1.isSingleton && !entityId2.isSingleton) entityId1.mentions.map(_.phrase.value).intersect(entityId2.mentions.map(_.phrase.value)).size else 0 - def clusterId(mentionId:Phrase#Value): WithinDocEntity = { - val mention = _spanToMention.getOrElse(mentionId,null) - if(mention == null || mention.entity == null ||mention.entity.isSingleton) null - else mention.entity - } - - -} - - -// CrossDocEntity should be unified with Jack's new hcoref replacement. -// ids, including cross-doc ids will be part of this work. -trait CrossDocMention extends AbstractMention { - def withinDocEntityId: String -} -trait CrossDocEntity extends AbstractEntity // ... - - - -///** Categorical variable indicating whether the mention is a pronoun, nominal or named proper noun. -// (Obviously different from MentionEntityType, which may indicate whether it is a person, location, organization, etc.) */ -//class MentionType(val mention:AbstractMention, targetValue:String) extends LabeledCategoricalVariable(targetValue) { -// def domain = OntonotesMentionTypeDomain -//} -///** The domain of MentionType, consisting of pronouns (PRO), nominals (NOM) and named proper nouns (NAM). */ -//object OntonotesMentionTypeDomain extends CategoricalDomain(List("PRO", "NOM", "NAM")) - - -// // In case we need to put labels on Mentions or Entities in addition to their underlying Phrases. 
-akm -//class OntonotesEntityType(category:String) extends LabeledCategoricalVariable[String](category) { -// def domain = OntonotesEntityTypeDomain -//} -// -//class PhraseOntonotesEntityType(val phrase:Phrase, value:String) extends OntonotesEntityType(value) -//class EntityOntonotesEntityType(val entity:AbstractEntity, value:String) extends OntonotesEntityType(value) -//class WithinDocEntityOntonotesEntityType(override val entity:WithinDocEntity, value:String) extends EntityOntonotesEntityType(entity, value) -// -//class EntityGender(val entity:AbstractEntity, value:String) extends Gender(value) -//class WithinDocEntityGender(override val entity:WithinDocEntity, value:String) extends EntityGender(entity, value) -////class CrossDocEntityGender(override val entity:CrossDocEntity, value:String) extends EntityGender(entity, value) -// -//class EntityNumber(val entity:AbstractEntity, value:String) extends Number(value) -//class WithinDocEntityNumber(override val entity:WithinDocEntity, value:String) extends EntityNumber(entity, value) - +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/MentionAlignment.scala b/src/main/scala/cc/factorie/app/nlp/coref/MentionAlignment.scala deleted file mode 100644 index d11c425..0000000 --- a/src/main/scala/cc/factorie/app/nlp/coref/MentionAlignment.scala +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.coref - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.phrase.{NounPhraseEntityTypeLabeler, OntonotesPhraseEntityType, ParseAndNerBasedPhraseFinder, ParseBasedPhraseFinder} -import cc.factorie.app.nlp.pos.PennPosTag -import cc.factorie.app.nlp.wordnet.WordNet - -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer - -/** Used for training with predicted mentions. 
- * If the predicted mention is equal to or within some specified alignment width in options we add the true spans entity label if any - * Otherwise we add the mention to the ground truth coref as a ground truth singleton.*/ -object MentionAlignment { - def makeLabeledData(documents:Seq[Document], outfile: String, useEntityTypes: Boolean, options: CorefOptions, map: DocumentAnnotatorMap): (Seq[Document]) = { - - //remove the gold POS annotation - if(!options.useGoldBoundaries) documents.foreach( d => d.tokens.foreach(t => t.attr.remove[PennPosTag])) - - val shifts = ArrayBuffer[Int]() - shifts += 0 - for(i <- 1 to options.mentionAlignmentShiftWidth){ - shifts += i - shifts += -1*i - } - - //align gold mentions to detected mentions in order to get labels for detected mentions - val alignmentInfo = documents.par.map(d => alignMentions(d,WordNet,useEntityTypes, options, shifts)) - - //do some analysis of the accuracy of this alignment - val numCorrect = alignmentInfo.map(_.numcorrect).sum.toDouble - val numGT = alignmentInfo.map(_.numGT).sum.toDouble - val numDetected = alignmentInfo.map(_.numDetected).sum.toDouble - println("precision = " + numCorrect + " / " + numDetected + " = " + numCorrect/numDetected) - println("recall = " + numCorrect + " / " + numGT + " = " + numCorrect/numGT) - - - documents - } - - def findMentions(doc: Document,options:CorefOptions,annotatorMap: DocumentAnnotatorMap = null) { - if(options.useGoldBoundaries){ - doc.getTargetCoref.mentions.foreach(m => doc.coref.addMention(m.phrase).phrase.attr += m.phrase.attr[OntonotesPhraseEntityType]) - }else if(!options.useNERMentions){ - ParseAndNerBasedPhraseFinder.FILTER_APPOS = true - val map = if(annotatorMap eq null) DocumentAnnotatorPipeline.defaultDocumentAnnotationMap else annotatorMap - DocumentAnnotatorPipeline(map, prereqs=Nil, ParseBasedPhraseFinder.prereqAttrs.toSeq).process(doc) - ParseBasedPhraseFinder.getPhrases(doc).foreach(doc.coref.addMention) - }else { - val defaultMap = if(annotatorMap eq null) DocumentAnnotatorPipeline.defaultDocumentAnnotationMap else annotatorMap - val preReqs = ConllPhraseFinder.prereqAttrs ++ PronounFinder.prereqAttrs ++AcronymNounPhraseFinder.prereqAttrs - DocumentAnnotatorPipeline.apply(map=defaultMap.toMap, prereqs=Nil, preReqs).process(doc) - (ConllPhraseFinder(doc) ++ PronounFinder(doc) ++ AcronymNounPhraseFinder(doc)).foreach(doc.getCoref.addMention) - } - DocumentAnnotatorPipeline.apply(DocumentAnnotatorPipeline.defaultDocumentAnnotationMap, prereqs=Nil, ForwardCoref.prereqAttrs).process(doc) - } - - case class PrecRecReport(numcorrect: Int, numGT: Int, numDetected: Int) - - //for each of the mentions in detectedMentions, this adds a reference to a ground truth entity - //the alignment is based on an **exact match** between the mention boundaries - def alignMentions(gtDoc: Document, wn: WordNet, useEntityTypes: Boolean, options: CorefOptions, shifts: Seq[Int],annotatorMap:DocumentAnnotatorMap = null): (PrecRecReport) = { - val groundTruthMentions = gtDoc.targetCoref.entities.filter(!_.isSingleton).flatMap(e => e.children).toSeq - val relevantGTMentions = groundTruthMentions.size - - //Set predicted mentions on the coref attribute of the document - if(gtDoc.coref.mentions.isEmpty) findMentions(gtDoc,options) - val detectedMentions = gtDoc.getCoref.mentions.toSeq - - val gtSpanHash = mutable.HashMap[(Int,Int),Mention]() - gtSpanHash ++= groundTruthMentions.map(m => ((m.phrase.start, m.phrase.length), m)) - val gtHeadHash = mutable.HashMap[Int,Mention]() - gtHeadHash ++= 
groundTruthMentions.map(m => (getHeadTokenInDoc(m),m)) - - val gtAligned = mutable.HashMap[Mention,Boolean]() - gtAligned ++= groundTruthMentions.map(m => (m,false)) - - var exactMatches = 0 - var relevantExactMatches = 0 - var unAlignedEntityCount = 0 - val debug = false - - //here, we create a bunch of new entity objects, that differ from the entities that the ground truth mentions point to - //however, we index them by the same uIDs that the ground mentions use - val entityHash = groundTruthMentions.groupBy(m => m.entity).toMap - val falsePositives1 = ArrayBuffer[Mention]() - detectedMentions.foreach(m => { - val alignment = checkContainment(gtSpanHash,gtHeadHash,m, options, shifts) - if(alignment.isDefined){ - val gtMention = alignment.get - val entity = gtMention.entity - //If aligned gold mention was a gold entity - if(entity != null) { - if(entityHash(gtMention.entity).length > 1 && !gtAligned(gtMention)) relevantExactMatches += 1 - gtAligned(gtMention) = true - if(debug) println("aligned: " + gtMention.string +":" + gtMention.phrase.start + " " + m.phrase.string + ":" + m.phrase.start +" " + gtMention.entity.uniqueId) - - exactMatches += 1 - if(options.useEntityType) m.phrase.attr += gtMention.phrase.attr[OntonotesPhraseEntityType] - else NounPhraseEntityTypeLabeler.process(m.phrase) - val newEntity = gtDoc.coref.entityFromUniqueId(gtMention.entity.uniqueId) - newEntity += m - } - //If the aligned gold mention was a loaded singleton, use any annotation information if wanted - else{ - if(options.useEntityType) m.phrase.attr += gtMention.phrase.attr[OntonotesPhraseEntityType] - else NounPhraseEntityTypeLabeler.process(m.phrase) - val newEntity = gtDoc.coref.entityFromUniqueId(gtDoc.name + "-" + gtDoc.targetCoref.entities.size+unAlignedEntityCount) - newEntity += m - unAlignedEntityCount += 1 - falsePositives1 += m - } - //Make the close alignment our new ground truth for training - }else{ - if(debug) println("not aligned: " + m.string + ":" + m.phrase.start) - //Add our mention which was unaligned to the target coref as a singleton for training - m.phrase.attr += new OntonotesPhraseEntityType(m.phrase,"O") - val newEntity = gtDoc.coref.entityFromUniqueId(gtDoc.name + "-" + gtDoc.targetCoref.entities.size+unAlignedEntityCount) - newEntity += m - unAlignedEntityCount += 1 - falsePositives1 += m - } - }) - - val countUnAligned = gtAligned.count(!_._2) - val newCoref = new WithinDocCoref(gtDoc) - newCoref.target = gtDoc.coref - //So we don't have to perform mention finding twice - gtDoc.coref.mentions.foreach(m => newCoref.addMention(m.phrase)) - gtDoc.attr += newCoref - - new PrecRecReport(relevantGTMentions-countUnAligned,relevantGTMentions,detectedMentions.length) - } - - def getHeadTokenInDoc(m: Mention): Int = m.phrase.start + m.phrase.headTokenOffset - - def checkContainment(startLengthHash: mutable.HashMap[(Int,Int),Mention], headHash: mutable.HashMap[Int,Mention] ,m: Mention, options: CorefOptions, shifts: Seq[Int]): Option[Mention] = { - val start = m.phrase.start - val length = m.phrase.length - val headTokIdxInDoc = m.phrase.headTokenOffset + m.phrase.start - val startIdx = start - val endIdx = start + length - - for (startShift <- shifts; endShift <- shifts; if startIdx + startShift <= endIdx + endShift) { - val newStart = startIdx + startShift - val newEnd = endIdx + endShift - val key = (newStart, newEnd - newStart) - if(startLengthHash.contains(key)) - return Some(startLengthHash(key)) - } - - //next, back off to aligning it based on the head token - 
if(headHash.contains(headTokIdxInDoc)) - return Some(headHash(headTokIdxInDoc)) - None - } -} - - diff --git a/src/main/scala/cc/factorie/app/nlp/coref/MentionBuffer.scala b/src/main/scala/cc/factorie/app/nlp/coref/MentionBuffer.scala new file mode 100644 index 0000000..a6f2c62 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/MentionBuffer.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.coref + +import scala.collection.mutable.ArrayBuffer + +/** A mutable ordered collection of Mentions. */ +class MentionBuffer extends ArrayBuffer[Mention] with MentionCollection diff --git a/src/main/scala/cc/factorie/app/nlp/coref/MentionCharacteristics.scala b/src/main/scala/cc/factorie/app/nlp/coref/MentionCharacteristics.scala new file mode 100644 index 0000000..93e3f80 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/MentionCharacteristics.scala @@ -0,0 +1,68 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.app.nlp.lexicon.{StaticLexicons, StopWords} +import cc.factorie.app.nlp.phrase.{Gender, NounPhraseType, Number, OntonotesPhraseEntityType} +import cc.factorie.app.nlp.wordnet.WordNet + +/** Various lazily-evaluated cached characteristics of a Mention, typically attached to a Mention as an attr. */ +class MentionCharacteristics(val mention: Mention, lexicon:StaticLexicons) { + // TODO These should be cleaned up and made more efficient -akm + lazy val isPRO = CorefFeatures.posTagsSet.contains(mention.phrase.headToken.posTag.categoryValue) + lazy val isProper = CorefFeatures.properSet.contains(mention.phrase.headToken.posTag.categoryValue) + lazy val isNoun = CorefFeatures.nounSet.contains(mention.phrase.headToken.posTag.categoryValue) + lazy val isPossessive = CorefFeatures.posSet.contains(mention.phrase.headToken.posTag.categoryValue) + + lazy val hasSpeakWord = mention.phrase.exists(s => lexicon.iesl.Say.contains(s.string)) + lazy val hasSpeakWordContext = prev.exists(w => lexicon.iesl.Say.containsWord(w)) || follow.exists(w => lexicon.iesl.Say.containsWord(w)) + lazy val wnLemma = WordNet.lemma(mention.phrase.headToken.string, "n") + lazy val wnSynsets = WordNet.synsets(wnLemma).toSet + lazy val wnHypernyms = WordNet.hypernyms(wnLemma) + lazy val wnAntonyms = wnSynsets.flatMap(_.antonyms()).toSet + lazy val nounWords: Set[String] = + mention.phrase.tokens.filter(_.posTag.categoryValue.startsWith("N")).map(t => t.string.toLowerCase).toSet + lazy val lowerCaseHead: String = mention.phrase.headToken.string.toLowerCase + lazy val lowerCaseString:String = mention.phrase.string.toLowerCase + lazy val headPhraseTrim: String = mention.phrase.tokensString(" ").trim + lazy val nonDeterminerWords: Seq[String] = + mention.phrase.tokens.filterNot(_.posTag.categoryValue == "DT").map(t => t.string.toLowerCase) + lazy val initials: String = + mention.phrase.tokens.map(_.string).filterNot(lexicon.iesl.OrgSuffix.contains).filter(t => t(0).isUpper).map(_(0)).mkString("") + lazy val predictEntityType: Int = mention.phrase.attr[OntonotesPhraseEntityType].intValue + lazy val demonym: String = lexicon.iesl.DemonymMap.getOrElse(headPhraseTrim, "") + + lazy val capitalization: Char = { + if (mention.phrase.length == 1 && mention.phrase.head.positionInSentence == 0) 'u' // mention is the first word in sentence + else { + val s = mention.phrase.value.filter(_.posTag.categoryValue.startsWith("N")).map(_.string.trim) // TODO Fix this slow String operation + if (s.forall(_.forall(_.isUpper))) 'a' + else if (s.forall(t => t.head.isLetter && t.head.isUpper)) 't' + else 'f' + } + } + lazy val
gender = mention.phrase.attr[Gender].categoryValue + lazy val number = mention.phrase.attr[Number].categoryValue + lazy val nounPhraseType = mention.phrase.attr[NounPhraseType].categoryValue + lazy val genderIndex = mention.phrase.attr[Gender].intValue + lazy val numberIndex = mention.phrase.attr[Number].intValue + lazy val nounPhraseTypeIndex = mention.phrase.attr[NounPhraseType].intValue + lazy val headPos = mention.phrase.headToken.posTag.categoryValue + lazy val inParens = mention.phrase.sentence.tokens.exists(t => t.posTag.categoryValue == "LRB" && t.positionInSection < mention.phrase.start) + lazy val prev = Vector(TokenFreqs.getTokenStringAtOffset(mention.phrase(0),-1), TokenFreqs.getTokenStringAtOffset(mention.phrase(0),-2)) + lazy val follow = Vector(TokenFreqs.getTokenStringAtOffset(mention.phrase.last,1), TokenFreqs.getTokenStringAtOffset(mention.phrase.last,2)) + + lazy val acronym: Set[String] = { + if (mention.phrase.length == 1) + Set.empty + else { + val alt1 = mention.phrase.value.map(_.string.trim).filter(_.exists(_.isLetter)) // tokens that have at least one letter character + val alt2 = alt1.filterNot(t => StopWords.contains(t.toLowerCase)) // alt1 tokens excluding stop words + val alt3 = alt1.filter(_.head.isUpper) // alt1 tokens that are capitalized + val alt4 = alt2.filter(_.head.isUpper) + Seq(alt1, alt2, alt3, alt4).map(_.map(_.head).mkString.toLowerCase).toSet + } + } + + lazy val canonicalizedPronounOrType = + if (isPRO) PronounSets.canonicalForms.getOrElse(lowerCaseString,lowerCaseHead) + else nounPhraseType +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/MentionCollection.scala b/src/main/scala/cc/factorie/app/nlp/coref/MentionCollection.scala new file mode 100644 index 0000000..43a22f4 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/MentionCollection.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.coref + +// TODO All three of these classes should be removed. -akm +/** A collection of Mentions, either immutable or mutable. */ +trait MentionCollection extends Iterable[Mention] diff --git a/src/main/scala/cc/factorie/app/nlp/coref/MentionList.scala b/src/main/scala/cc/factorie/app/nlp/coref/MentionList.scala new file mode 100644 index 0000000..17a940f --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/MentionList.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.util.ImmutableArrayIndexedSeq + +/** An immutable ordered collection of Mentions. */ +class MentionList(mentions:Iterable[Mention]) extends ImmutableArrayIndexedSeq(mentions) with MentionCollection + diff --git a/src/main/scala/cc/factorie/app/nlp/coref/MentionPairFeatures.scala b/src/main/scala/cc/factorie/app/nlp/coref/MentionPairFeatures.scala new file mode 100644 index 0000000..1117736 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/MentionPairFeatures.scala @@ -0,0 +1,221 @@ +/* Copyright (C) 2008-2016 University of Massachusetts Amherst. + This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) + http://factorie.cs.umass.edu, http://github.com/factorie + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
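A minimal sketch, not part of this patch: MentionCharacteristics is intended to be attached once to a Mention's attr map so its lazy vals are computed on first access and then cached. The helper below only mirrors the attach-if-absent pattern that MentionPairFeatures uses later in this file; the object and method names are illustrative.

object MentionCharacteristicsSketch {
  import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons}
  // Return the cached characteristics of a mention, creating and attaching them on first request.
  def characteristics(m: Mention, lexicon: StaticLexicons): MentionCharacteristics = {
    if (m.attr[MentionCharacteristics] eq null) m.attr += new MentionCharacteristics(m, lexicon)
    m.attr[MentionCharacteristics]
  }
  // Typical call site, e.g. inside a feature extractor:
  //   val chars = characteristics(mention, new StaticLexicons()(LexiconsProvider.classpath()))
  //   if (chars.isPRO) ... else ... chars.nounWords ...
}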
+ See the License for the specific language governing permissions and + limitations under the License. */ +package cc.factorie.app.nlp.coref + +import cc.factorie.app.nlp.Token +import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} +import cc.factorie.la.{GrowableSparseBinaryTensor1, SparseTensor} +import cc.factorie.variable.BinaryFeatureVectorVariable + +/** Contains two possible feature sets: + * Lexical & Conventional + * Conventional - String Match, Gender Cross, Head word / entity Type etc + * Lexical - Anaphoricity Detection if mention1 == mention2, else Lexical features for the pair + * A binary feature vector for the features of a mention pair. + Here, mention1 is the mention to the right. */ +class MentionPairFeatures(val model: CorefModel, val mention1:Mention, val mention2:Mention, mentions: Seq[Mention], options: CorefOptions) extends BinaryFeatureVectorVariable[String] { + { + val t = new GrowableSparseBinaryTensor1(domain.dimensionDomain) + val sizeBoundary = if (options.featureSet == "conventional"){ + if (options.conjunctionStyle == ConjunctionOptions.SLOW_CONJUNCTIONS) 650 + else 70 //Count of features needed plus any neighbor merges + } else{ + if (options.conjunctionStyle == ConjunctionOptions.PRON_CONJUNCTIONS) 40 + else 16 + } + t.sizeHint(sizeBoundary) + set(t)(null) + } + + // todo fix this + @deprecated("This exists to preserve prior behavior, it should be a constructor argument", "10/5/15") + val lexicon = new StaticLexicons()(LexiconsProvider.classpath()) + + def domain = model.MentionPairFeaturesDomain + override def skipNonCategories = true + val features = this + var basicFeatureCalculated = false + var conjunctionCalculated = false + val mergeableFeatures = collection.mutable.Set[String]() + def bin(value: Int, bins: Seq[Int]): Int = math.signum(value) * (bins :+ Int.MaxValue).indexWhere(_ > math.abs(value)) + val pfx = mentType(mention1) +":" + mentType(mention2) + def mentType(mention:Mention): String = if (mention.phrase.isPronoun) "pro" else "non" + + computeFeatures() + + def computeFeatures() { + if(options.featureSet == "lexical") + computeLexicalFeatures() + else computeConventionalFeatures() + } + + def addFeature(f: String) { + if(options.trainSeparatePronounWeights){ + features += pfx + "-" + f + }else features += f + } + + + def addFeatureWithPronConjunction(featLabel: String) { + addFeature(featLabel) + addFeature(featLabel + "C=" + mention1.attr[MentionCharacteristics].canonicalizedPronounOrType) + if (mention1 != mention2) addFeature("P=" + mention2.attr[MentionCharacteristics].canonicalizedPronounOrType) + } + + def addMergeableFeature(f: String) { + if (options.mergeFeaturesAtAll) { + assert(mergeableFeatures ne null) + assert(f ne null) + mergeableFeatures += f + } + addFeature(f) + } + + def computeConjunctionFeatures() { + if (basicFeatureCalculated && !conjunctionCalculated) { + if (options.conjunctionStyle == ConjunctionOptions.SLOW_CONJUNCTIONS) { + val activeDomainSize = features.value.activeDomainSize + val basicFeats = features.value.asInstanceOf[SparseTensor]._indices + //Note: this doesn't quite work with hash domains + for (a <- 0 until activeDomainSize - 1) { + for (b <- a + 1 until activeDomainSize) { + val sb = new StringBuilder + sb.append(basicFeats(a)); sb.append("_&&_"); sb.append(basicFeats(b)) + addFeature(sb.toString()) + } + } + } + conjunctionCalculated = true + } + } + + lazy val mergeableAllFeatures = mergeableFeatures + + def computeConventionalFeatures() { + val m1 =
if(mention1.attr[MentionCharacteristics] eq null){ mention1.attr += new MentionCharacteristics(mention1, lexicon); mention1.attr[MentionCharacteristics]} else mention1.attr[MentionCharacteristics] + val m2 = if(mention2.attr[MentionCharacteristics] eq null){ mention2.attr += new MentionCharacteristics(mention2, lexicon); mention2.attr[MentionCharacteristics]} else mention2.attr[MentionCharacteristics] + if (basicFeatureCalculated) return + + addMergeableFeature("BIAS") + addMergeableFeature("gmc" + m1.gender + "" + m2.gender) + addMergeableFeature("nms" + m1.number + "" + m2.number) + if (m1.nonDeterminerWords == m2.nonDeterminerWords) + addMergeableFeature("hms") + else addMergeableFeature("hmsf") + addMergeableFeature("mt1" + m1.headPos) + addFeature("mt2" + m2.headPos) + if (!m1.nounWords.intersect(m2.nounWords).isEmpty) + addMergeableFeature("pmhm") + else addMergeableFeature("pmhmf") + if (m1.lowerCaseString.contains(m2.lowerCaseString) || m2.lowerCaseString.contains(m1.lowerCaseString)) + addMergeableFeature("sh") + else addMergeableFeature("shf") + if (CorefFeatures.canBeAliases(mention1, mention2)) addMergeableFeature("sapetc") else addMergeableFeature("soonAliasPredETypeCached:false") + if (m1.wnSynsets.exists(m2.wnSynsets.contains)) + addMergeableFeature("asyn") + else addMergeableFeature("asynf") + if (m1.wnSynsets.exists(m2.wnAntonyms.contains)) + addMergeableFeature("aan") + else addMergeableFeature("aanf") + if (m1.wnSynsets.exists(m2.wnHypernyms.contains) || m2.wnSynsets.exists(m1.wnHypernyms.contains)) + addMergeableFeature("ahyp") + else addMergeableFeature("ahypf") + if (m1.wnHypernyms.exists(m2.wnHypernyms.contains)) addMergeableFeature("hsh") + else addMergeableFeature("hshf") + if (CorefFeatures.areAppositive(mention1, mention2)) + addMergeableFeature("aA") + else addMergeableFeature("aAf") + if (m1.hasSpeakWord && m2.hasSpeakWord) + addMergeableFeature("bs") + else addMergeableFeature("bsf") + if (CorefFeatures.areRelative(mention1, mention2)) + addMergeableFeature("rpf") + else addMergeableFeature("rpff") + for (cm <- CorefFeatures.countCompatibleMentionsBetween(mention1, mention2, mentions.toSeq)) addMergeableFeature("cmc" + cm) + addMergeableFeature("mtpw" + (if (m2.isPRO) m2.headPos + mention1.phrase.headToken.string else m2.headPos + m1.headPos)) + addMergeableFeature("pwhe" + CorefFeatures.proWordHead(mention1,mention2)) + addMergeableFeature("etm" + CorefFeatures.entityTypeMatch(mention1,mention2)) + addMergeableFeature("lhp" + CorefFeatures.headWordsCross(mention1, mention2, model)) + if (mention1.phrase.sentence == mention2.phrase.sentence) addMergeableFeature("ss") // false values of this feature are not included in Roth's system + CorefFeatures.matchingTokensRelations(mention1, mention2, lexicon).foreach(r => addMergeableFeature("apr" + r)) + + if (mention1.phrase.head.string.toLowerCase == mention2.phrase.head.string.toLowerCase) + addMergeableFeature("bM") + else addMergeableFeature("bMf") + if (mention1.phrase.last.string.toLowerCase == mention2.phrase.last.string.toLowerCase) + addMergeableFeature("eM") + else addMergeableFeature("eMf") + if (mention1.phrase.head.string == mention2.phrase.head.string) + addMergeableFeature("bMc") + else addMergeableFeature("bMcf") + if (mention1.phrase.last.string == mention2.phrase.last.string) + addMergeableFeature("eMc") + else addMergeableFeature("eMcf") + if (m1.isPRO) addMergeableFeature("pa") else addMergeableFeature("paf") + + val binTokenSentenceDistances = false + val sdist = 
bin(mention1.phrase.sentence.indexInSection - mention2.phrase.sentence.indexInSection, 1 to 10) + if (binTokenSentenceDistances) for (sd <- 1 to sdist) addMergeableFeature("sd" + sd) + else addMergeableFeature("sd" + sdist) + val tdist = bin(mention1.phrase.start - mention2.phrase.start, Seq(1, 2, 3, 4, 5, 10, 20, 50, 100, 200)) + if (binTokenSentenceDistances) for (td <- 1 to tdist) addMergeableFeature("td" + td) + else addMergeableFeature("td" + tdist) + if (m1.demonym != "" && m1.demonym == m2.demonym) addMergeableFeature("dM") else addMergeableFeature("dMf") + addMergeableFeature("cap" + m1.capitalization +"_" + m2.capitalization) + addMergeableFeature("hpos" + mention2.phrase.headToken.posTag.value + "_" + mention1.phrase.headToken.posTag.value) + addMergeableFeature("am" + CorefFeatures.acronymMatch(mention1,mention2)) + basicFeatureCalculated = true + } + + def computeLexicalFeatures(): Unit = { + val m1 = if(mention1.attr[MentionCharacteristics] eq null){ mention1.attr += new MentionCharacteristics(mention1, lexicon); mention1.attr[MentionCharacteristics]} else mention1.attr[MentionCharacteristics] + val m2 = if(mention2.attr[MentionCharacteristics] eq null){ mention2.attr += new MentionCharacteristics(mention2, lexicon); mention2.attr[MentionCharacteristics]} else mention2.attr[MentionCharacteristics] + if (basicFeatureCalculated) return + + features += "Bias" //+ currMention.mType + val newEntity = mention1 == mention2 + addFeatureWithPronConjunction("Len=" + mention1.phrase.tokens.size + "|NE=" + newEntity) + val counts = model.CorefTokenFrequencies.counter + addFeatureWithPronConjunction("HdWd=" + returnWord(mention1.phrase.headToken,counts,counts.headWords) + "|NE=" + newEntity) + addFeatureWithPronConjunction("First=" + returnWord(mention1.phrase(0),counts,counts.firstWords) + "|NE=" + newEntity) + addFeatureWithPronConjunction("Last=" + returnWord(mention1.phrase.tokens.last,counts,counts.lastWords) + "|NE=" + newEntity) + addFeatureWithPronConjunction("Class=" + returnWordForm(mention1.phrase(0),counts) + "|NE=" +newEntity) + addFeature("Pos=" + mention1.phrase.sentence.indexInSection + "|NE=" + newEntity) + if(!newEntity){ + features += "PrevLen=" + mention2.phrase.tokens.size + addFeatureWithPronConjunction("PrevHead=" + returnWord(mention2.phrase.headToken,counts,counts.headWords)) + addFeatureWithPronConjunction("PrevHeadShape=" + returnShape(mention2.phrase.headToken,counts)) + addFeatureWithPronConjunction("PrevFirst=" + returnWord(mention2.phrase(0),counts,counts.firstWords)) + addFeatureWithPronConjunction("PrevLast=" + returnWord(mention2.phrase.last,counts,counts.lastWords)) + addFeatureWithPronConjunction("PrevPrec=" + returnWord(TokenFreqs.getTokenAtOffset(mention2.phrase(0),-1),counts,counts.precContext)) + addFeatureWithPronConjunction("PrevFollow=" + returnWord(TokenFreqs.getTokenAtOffset(mention2.phrase.last,+1),counts,counts.followContext)) + + //Pair Features + var dist = mention1.phrase.sentence.indexInSection - mention2.phrase.sentence.indexInSection + if(dist <10) addFeature("sent_dist=" + dist.toString) + dist = mention1.phrase.start - mention2.phrase.start + if(dist < 10) addFeature("mention_dist=" + dist.toString) + + if(m1.lowerCaseString == m2.lowerCaseString) addFeature("String_Match") + else addFeature("No_String_Match") + if(m1.lowerCaseHead == m2.lowerCaseHead) addFeature("Head_Match") + else addFeature("No_Head_Match") + + addFeature("curr-type" + m1.predictEntityType + "|link-type" + m2.predictEntityType) + addFeature("gmc1" + m1.genderIndex 
+ "|gmc2"+m2.genderIndex) + } + } + private def returnWord(token: Token, counter: TopTokenFrequencies, category: DefaultHashMap[String,Int]): String = if(token == null) "NA" else counter.containsToken(category,token) + private def returnShape(token: Token, counter: TopTokenFrequencies): String = counter.containsString(counter.shapes,cc.factorie.app.strings.stringShape(token.string, 2)) + private def returnWordForm(token: Token,counter: TopTokenFrequencies): String = counter.containsString(counter.wordForm,TokenFreqs.getWordClass(token)) +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/MentionPairLabel.scala b/src/main/scala/cc/factorie/app/nlp/coref/MentionPairLabel.scala index e7a71ae..202b64f 100644 --- a/src/main/scala/cc/factorie/app/nlp/coref/MentionPairLabel.scala +++ b/src/main/scala/cc/factorie/app/nlp/coref/MentionPairLabel.scala @@ -1,302 +1,9 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ package cc.factorie.app.nlp.coref -import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} -import cc.factorie.app.nlp.Token -import cc.factorie.la.{GrowableSparseBinaryTensor1, SparseTensor} -import cc.factorie.variable.{BinaryFeatureVectorVariable, LabeledCategoricalVariable} +import cc.factorie.variable.LabeledCategoricalVariable -import scala.collection.mutable - -/** Contains two possible mention sets: - * Lexical & Conventional - * Conventional - String Match, Gender Cross, Head word / entity Type etc - * Lexical - Anaphoricity Detection if mention1 == mention2, else Lexical features for the pair - * A binary feature vector for the features of a mention pair. - Here, mention1 is the mention to the right. 
*/ -class MentionPairFeatures(val model: CorefModel, val mention1:Mention, val mention2:Mention, mentions: Seq[Mention], options: CorefOptions) extends BinaryFeatureVectorVariable[String] { - { - val t = new GrowableSparseBinaryTensor1(domain.dimensionDomain) - val sizeBoundary = if (options.featureSet == "conventional"){ - if (options.conjunctionStyle == ConjunctionOptions.SLOW_CONJUNCTIONS) 650 - else 70 //Count of features needed plus any neighbor merges - } else{ - if (options.conjunctionStyle == ConjunctionOptions.PRON_CONJUNCTIONS) 40 - else 16 - } - t.sizeHint(sizeBoundary) - set(t)(null) - } - - // todo fix this - @deprecated("This exists to preserve prior behavior, it should be a constructor argument", "10/5/15") - val lexicon = new StaticLexicons()(LexiconsProvider.classpath()) - - def domain = model.MentionPairFeaturesDomain - override def skipNonCategories = true - val features = this - var basicFeatureCalculated = false - var conjunctionCalculated = false - val mergeableFeatures = collection.mutable.Set[String]() - def bin(value: Int, bins: Seq[Int]): Int = math.signum(value) * (bins :+ Int.MaxValue).indexWhere(_ > math.abs(value)) - val pfx = mentType(mention1) +":" + mentType(mention2) - def mentType(mention:Mention): String = if (mention.phrase.isPronoun) "pro" else "non" - - computeFeatures() - - def computeFeatures() { - if(options.featureSet == "lexical") - computeLexicalFeatures() - else computeConventionalFeatures() - } - - def addFeature(f: String) { - if(options.trainSeparatePronounWeights){ - features += pfx + "-" + f - }else features += f - } - - - def addFeatureWithPronConjunction(featLabel: String) { - addFeature(featLabel) - addFeature(featLabel + "C=" + mention1.attr[MentionCharacteristics].canonicalizedPronounOrType) - if (mention1 != mention2) addFeature("P=" + mention2.attr[MentionCharacteristics].canonicalizedPronounOrType) - } - - def addMergeableFeature(f: String) { - if (options.mergeFeaturesAtAll) { - assert(mergeableFeatures ne null) - assert(f ne null) - mergeableFeatures += f - } - addFeature(f) - } - - def computeConjunctionFeatures() { - if (basicFeatureCalculated && !conjunctionCalculated) { - if (options.conjunctionStyle == ConjunctionOptions.SLOW_CONJUNCTIONS) { - val activeDomainSize = features.value.activeDomainSize - val basicFeats = features.value.asInstanceOf[SparseTensor]._indices - //Note: this doesnt quite work with hash domains - for (a <- 0 until activeDomainSize - 1) { - for (b <- a + 1 until activeDomainSize) { - val sb = new StringBuilder - sb.append(basicFeats(a)); sb.append("_&&_"); sb.append(basicFeats(b)) - addFeature(sb.toString()) - } - } - } - conjunctionCalculated = true - } - } - - lazy val mergeableAllFeatures = mergeableFeatures - - def computeConventionalFeatures() { - val m1 = if(mention1.attr[MentionCharacteristics] eq null){ mention1.attr += new MentionCharacteristics(mention1, lexicon); mention1.attr[MentionCharacteristics]} else mention1.attr[MentionCharacteristics] - val m2 = if(mention2.attr[MentionCharacteristics] eq null){ mention2.attr += new MentionCharacteristics(mention2, lexicon); mention2.attr[MentionCharacteristics]} else mention2.attr[MentionCharacteristics] - if (basicFeatureCalculated) return - - addMergeableFeature("BIAS") - addMergeableFeature("gmc" + m1.gender + "" + m2.gender) - addMergeableFeature("nms" + m1.number + "" + m2.number) - if (m1.nonDeterminerWords == m2.nonDeterminerWords) - addMergeableFeature("hms") - else addMergeableFeature("hmsf") - addMergeableFeature("mt1" + m1.headPos) - 
addFeature("mt2" + m2.headPos) - if (!m1.nounWords.intersect(m2.nounWords).isEmpty) - addMergeableFeature("pmhm") - else addMergeableFeature("pmhmf") - if (m1.lowerCaseString.contains(m2.lowerCaseString) || m2.lowerCaseString.contains(m1.lowerCaseString)) - addMergeableFeature("sh") - else addMergeableFeature("shf") - if (CorefFeatures.canBeAliases(mention1, mention2)) addMergeableFeature("sapetc") else addMergeableFeature("soonAliasPredETypeCached:false") - if (m1.wnSynsets.exists(m2.wnSynsets.contains)) - addMergeableFeature("asyn") - else addMergeableFeature("asynf") - if (m1.wnSynsets.exists(m2.wnAntonyms.contains)) - addMergeableFeature("aan") - else addMergeableFeature("aanf") - if (m1.wnSynsets.exists(m2.wnHypernyms.contains) || m2.wnSynsets.exists(m1.wnHypernyms.contains)) - addMergeableFeature("ahyp") - else addMergeableFeature("ahypf") - if (m1.wnHypernyms.exists(m2.wnHypernyms.contains)) addMergeableFeature("hsh") - else addMergeableFeature("hshf") - if (CorefFeatures.areAppositive(mention1, mention2)) - addMergeableFeature("aA") - else addMergeableFeature("aAf") - if (m1.hasSpeakWord && m2.hasSpeakWord) - addMergeableFeature("bs") - else addMergeableFeature("bsf") - if (CorefFeatures.areRelative(mention1, mention2)) - addMergeableFeature("rpf") - else addMergeableFeature("rpff") - for (cm <- CorefFeatures.countCompatibleMentionsBetween(mention1, mention2, mentions.toSeq)) addMergeableFeature("cmc" + cm) - addMergeableFeature("mtpw" + (if (m2.isPRO) m2.headPos + mention1.phrase.headToken.string else m2.headPos + m1.headPos)) - addMergeableFeature("pwhe" + CorefFeatures.proWordHead(mention1,mention2)) - addMergeableFeature("etm" + CorefFeatures.entityTypeMatch(mention1,mention2)) - addMergeableFeature("lhp" + CorefFeatures.headWordsCross(mention1, mention2, model)) - if (mention1.phrase.sentence == mention2.phrase.sentence) addMergeableFeature("ss") // false values of this feature are not included in Roth's system - CorefFeatures.matchingTokensRelations(mention1, mention2, lexicon).foreach(r => addMergeableFeature("apr" + r)) - - if (mention1.phrase.head.string.toLowerCase == mention2.phrase.head.string.toLowerCase) - addMergeableFeature("bM") - else addMergeableFeature("bMf") - if (mention1.phrase.last.string.toLowerCase == mention2.phrase.last.string.toLowerCase) - addMergeableFeature("eM") - else addMergeableFeature("eMf") - if (mention1.phrase.head.string == mention2.phrase.head.string) - addMergeableFeature("bMc") - else addMergeableFeature("bMcf") - if (mention1.phrase.last.string == mention2.phrase.last.string) - addMergeableFeature("eMc") - else addMergeableFeature("eMcf") - if (m1.isPRO) addMergeableFeature("pa") else addMergeableFeature("paf") - - val binTokenSentenceDistances = false - val sdist = bin(mention1.phrase.sentence.indexInSection - mention2.phrase.sentence.indexInSection, 1 to 10) - if (binTokenSentenceDistances) for (sd <- 1 to sdist) addMergeableFeature("sd" + sd) - else addMergeableFeature("sd" + sdist) - val tdist = bin(mention1.phrase.start - mention2.phrase.start, Seq(1, 2, 3, 4, 5, 10, 20, 50, 100, 200)) - if (binTokenSentenceDistances) for (td <- 1 to tdist) addMergeableFeature("td" + td) - else addMergeableFeature("td" + tdist) - if (m1.demonym != "" && m1.demonym == m2.demonym) addMergeableFeature("dM") else addMergeableFeature("dMf") - addMergeableFeature("cap" + m1.capitalization +"_" + m2.capitalization) - addMergeableFeature("hpos" + mention2.phrase.headToken.posTag.value + "_" + mention1.phrase.headToken.posTag.value) - 
addMergeableFeature("am" + CorefFeatures.acronymMatch(mention1,mention2)) - basicFeatureCalculated = true - } - - def computeLexicalFeatures(): Unit = { - val m1 = if(mention1.attr[MentionCharacteristics] eq null){ mention1.attr += new MentionCharacteristics(mention1, lexicon); mention1.attr[MentionCharacteristics]} else mention1.attr[MentionCharacteristics] - val m2 = if(mention2.attr[MentionCharacteristics] eq null){ mention2.attr += new MentionCharacteristics(mention2, lexicon); mention2.attr[MentionCharacteristics]} else mention2.attr[MentionCharacteristics] - if (basicFeatureCalculated) return - - features += "Bias" //+ currMention.mType - val newEntity = mention1 == mention2 - addFeatureWithPronConjunction("Len=" + mention1.phrase.tokens.size + "|NE=" + newEntity) - val counts = model.CorefTokenFrequencies.counter - addFeatureWithPronConjunction("HdWd=" + returnWord(mention1.phrase.headToken,counts,counts.headWords) + "|NE=" + newEntity) - addFeatureWithPronConjunction("First=" + returnWord(mention1.phrase(0),counts,counts.firstWords) + "|NE=" + newEntity) - addFeatureWithPronConjunction("Last=" + returnWord(mention1.phrase.tokens.last,counts,counts.lastWords) + "|NE=" + newEntity) - addFeatureWithPronConjunction("Class=" + returnWordForm(mention1.phrase(0),counts) + "|NE=" +newEntity) - addFeature("Pos=" + mention1.phrase.sentence.indexInSection + "|NE=" + newEntity) - if(!newEntity){ - features += "PrevLen=" + mention2.phrase.tokens.size - addFeatureWithPronConjunction("PrevHead=" + returnWord(mention2.phrase.headToken,counts,counts.headWords)) - addFeatureWithPronConjunction("PrevHeadShape=" + returnShape(mention2.phrase.headToken,counts)) - addFeatureWithPronConjunction("PrevFirst=" + returnWord(mention2.phrase(0),counts,counts.firstWords)) - addFeatureWithPronConjunction("PrevLast=" + returnWord(mention2.phrase.last,counts,counts.lastWords)) - addFeatureWithPronConjunction("PrevPrec=" + returnWord(TokenFreqs.getTokenAtOffset(mention2.phrase(0),-1),counts,counts.precContext)) - addFeatureWithPronConjunction("PrevFollow=" + returnWord(TokenFreqs.getTokenAtOffset(mention2.phrase.last,+1),counts,counts.followContext)) - - //Pair Features - var dist = mention1.phrase.sentence.indexInSection - mention2.phrase.sentence.indexInSection - if(dist <10) addFeature("sent_dist=" + dist.toString) - dist = mention1.phrase.start - mention2.phrase.start - if(dist < 10) addFeature("mention_dist=" + dist.toString) - - if(m1.lowerCaseString == m2.lowerCaseString) addFeature("String_Match") - else addFeature("No_String_Match") - if(m1.lowerCaseHead == m2.lowerCaseHead) addFeature("Head_Match") - else addFeature("No_Head_Match") - - addFeature("curr-type" + m1.predictEntityType + "|link-type" + m2.predictEntityType) - addFeature("gmc1" + m1.genderIndex + "|gmc2"+m2.genderIndex) - } - } - private def returnWord(token: Token, counter: TopTokenFrequencies, category: DefaultHashMap[String,Int]): String = if(token == null) "NA" else counter.containsToken(category,token) - private def returnShape(token: Token, counter: TopTokenFrequencies): String = counter.containsString(counter.shapes,cc.factorie.app.strings.stringShape(token.string, 2)) - private def returnWordForm(token: Token,counter: TopTokenFrequencies): String = counter.containsString(counter.wordForm,TokenFreqs.getWordClass(token)) -} - -class MentionPairLabel(val model: PairwiseCorefModel, val mention1:Mention, val mention2:Mention, mentions: Seq[Mention], val initialValue: Boolean, options: CorefOptions) extends LabeledCategoricalVariable(if 
(initialValue) "YES" else "NO") { +class MentionPairLabel(val model: PairwiseCorefModel, val mention1:Mention, val mention2:Mention, mentions: Seq[Mention], val initialValue: Boolean, options: CorefOptions) extends LabeledCategoricalVariable(if (initialValue) "YES" else "NO") { def domain = model.MentionPairLabelDomain - def genFeatures():MentionPairFeatures = new MentionPairFeatures(model, mention1, mention2, mentions, options) -} - - -class TopTokenFrequencies(val headWords: DefaultHashMap[String,Int], - val firstWords: DefaultHashMap[String,Int] = null, - val lastWords: DefaultHashMap[String,Int] = null, - val precContext: DefaultHashMap[String,Int] = null, - val followContext: DefaultHashMap[String,Int] = null, - val shapes: DefaultHashMap[String,Int] = null, - val wordForm: DefaultHashMap[String,Int] = null, default: Int = 20) { - def this(nonPronouns: Seq[Mention],typesOfCounts: Seq[String], default:Int) = this( - if(typesOfCounts.contains("Head")) TokenFreqs.countWordTypes(nonPronouns,(t) => t.phrase.headToken.string.toLowerCase,default) else null, - if(typesOfCounts.contains("First")) TokenFreqs.countWordTypes(nonPronouns,(t) => t.phrase.tokens(0).string.toLowerCase,default)else null, - if(typesOfCounts.contains("Last")) TokenFreqs.countWordTypes(nonPronouns,(t) => t.phrase.last.string.toLowerCase,default)else null, - if(typesOfCounts.contains("Prec")) TokenFreqs.countWordTypes(nonPronouns,(t) => TokenFreqs.getTokenStringAtOffset(t.phrase.tokens(0),-1).toLowerCase,default)else null, - if(typesOfCounts.contains("Follow")) TokenFreqs.countWordTypes(nonPronouns,(t) => TokenFreqs.getTokenStringAtOffset(t.phrase.last,1).toLowerCase,default)else null, - if(typesOfCounts.contains("Shape")) TokenFreqs.countWordTypes(nonPronouns,(t) => cc.factorie.app.strings.stringShape(t.phrase.string,2),default)else null, - if(typesOfCounts.contains("WordForm")) TokenFreqs.countWordTypes(nonPronouns,(t) => TokenFreqs.getWordClass(t.phrase.headToken),default)else null) - - - //If this token is not a top token, fall back on using pos tag - def containsToken(lexicon: DefaultHashMap[String,Int],token: Token): String = { - if(lexicon.contains(token.string.toLowerCase)) token.string.toLowerCase - else token.posTag.categoryValue - } - def containsString(lexicon: DefaultHashMap[String,Int],tokenString: String): String = if(lexicon.contains(tokenString)) tokenString else "" + def genFeatures(): MentionPairFeatures = new MentionPairFeatures(model, mention1, mention2, mentions, options) } - - -object TokenFreqs{ - def countWordTypes(nonPronouns: Seq[Mention],specificWordFunc: (Mention) => String, cutoff: Int): DefaultHashMap[String,Int] = { - countAndPrune(nonPronouns.map(specificWordFunc),cutoff) - } - - private def countAndPrune(words: Seq[String], cutoff: Int): DefaultHashMap[String,Int] = { - val counts = new DefaultHashMap[String,Int](0) - words.foreach(key=>counts(key) += 1) - counts.foreach{case (key,value) => if(value < cutoff) counts.remove(key)} - counts - } - - def getTokenAtOffset(token: Token,offset: Int): Token = { val t = token.next(offset); if (t ne null) t else null } - def getTokenStringAtOffset(token: Token,offset: Int): String = { val t = token.next(offset); if (t ne null) t.string else ""} - - def getWordClass(word: Token):String = { - val sb = new StringBuilder - if (word.isCapitalized) { - if (word.containsLowerCase) sb.append("Cap-Mix") - else sb.append("Cap") - } - if (word.isDigits) sb.append("Num") - else if (word.containsDigit) sb.append("Num-Mix") - if (word.string.contains('-')) 
sb.append("Dash") - if (word.string.contains('s') && word.string.length() >= 3) sb.append("-S") - else if (word.string.length() >= 5){ - val lowerCase = word.string.toLowerCase - if (lowerCase.endsWith("ed")) sb.append("-ed") - else if (lowerCase.endsWith("ing")) sb.append("-ing") - else if (lowerCase.endsWith("ion")) sb.append("-ion") - else if (lowerCase.endsWith("er")) sb.append("-er") - else if (lowerCase.endsWith("est")) sb.append("-est") - else if (lowerCase.endsWith("ly")) sb.append("-ly") - else if (lowerCase.endsWith("ity")) sb.append("-ity") - else if (lowerCase.endsWith("y")) sb.append("-y") - else sb.append("-none") - } - sb.toString() - } -} - -class DefaultHashMap[String,Int](val defaultValue: Int) extends mutable.HashMap[String,Int] { - override def default(key:String) = defaultValue -} - diff --git a/src/main/scala/cc/factorie/app/nlp/coref/MentionPhraseFinder.scala b/src/main/scala/cc/factorie/app/nlp/coref/MentionPhraseFinder.scala index 79c082b..1374cdc 100644 --- a/src/main/scala/cc/factorie/app/nlp/coref/MentionPhraseFinder.scala +++ b/src/main/scala/cc/factorie/app/nlp/coref/MentionPhraseFinder.scala @@ -1,111 +1,25 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ package cc.factorie.app.nlp.coref import cc.factorie.app.nlp.Document -import cc.factorie.app.nlp.phrase.{NounPhraseEntityTypeLabeler, Phrase, NounPhraseType, ConllPhraseEntityType} -import cc.factorie.app.nlp.pos.{PennPosDomain, PennPosTag} -import cc.factorie.app.nlp.Token -import cc.factorie.app.nlp.ner._ -import scala.collection.mutable - -/** Trait for objects that return a list of Phrases given a Document - whose annotations includes those classes listed in prereqAttrs. - This is not a DocumentAnnotator because it does not add its results to the Document.attr; - invocations to its apply method simple return a collection of Phrases. - - This design was chosen because these phrases are often used for coreference - in which there are many coreference-specific choices of what mentions are filtered - or included, and we didn't want to pollute the Document.attr with a tremendous number - of postAttrs that are specific to individual coreference solutions. - - If you really want a DocumentAnnotator that saves its results, it is easy to - create one uses a PhraseFinder. - - @author Andrew McCallum - */ +import cc.factorie.app.nlp.phrase.Phrase + +/** Trait for objects that return a list of Phrases given a Document + *whose annotations includes those classes listed in prereqAttrs. + *This is not a DocumentAnnotator because it does not add its results to the Document.attr; + *invocations to its apply method simple return a collection of Phrases. 
+ *
+ * This design was chosen because these phrases are often used for coreference,
+ * in which there are many coreference-specific choices of what mentions are filtered
+ * or included, and we didn't want to pollute the Document.attr with a tremendous number
+ * of postAttrs that are specific to individual coreference solutions.
+ *
+ * If you really want a DocumentAnnotator that saves its results, it is easy to
+ * create one that uses a MentionPhraseFinder.
+ *
+ * @author Andrew McCallum
+ */
 trait MentionPhraseFinder {
 def prereqAttrs: Seq[Class[_]]
 //def phrasePostAttrs: Seq[Class[_]] // TODO Should we have something like this?
 def apply(document:Document): Seq[Phrase]
 }
-
-
-/** Apply returns a list of pronoun phrases, given PennPosTags.
- @author Andrew McCallum */
-object PronounFinder extends MentionPhraseFinder {
- def prereqAttrs = Seq(classOf[PennPosTag])
- def apply(document:Document): Seq[Phrase] = {
- val phrases = document.tokens.filter(_.attr[PennPosTag].isPersonalPronoun).map(t => new Phrase(t.section, start=t.positionInSection, length=1,offsetToHeadToken = -1)).toSeq
- for (phrase <- phrases) phrase.attr += new NounPhraseType(phrase, "PRO")
- phrases
- }
-}
-
-class NerPhraseFinder[Span <: NerSpan] extends MentionPhraseFinder {
- val prereqAttrs = Seq(classOf[NerSpanBuffer[Span]])
- def apply(doc:Document):Seq[Phrase] =
- doc.attr[NerSpanBuffer[Span]].map(new Phrase(_))
-}
-
-object AnyNerPhraseFinder extends NerPhraseFinder[NerSpan]
-object ConllPhraseFinder extends NerPhraseFinder[ConllNerSpan]
-object OntonotesPhraseFinder extends NerPhraseFinder[OntonotesNerSpan]
-
-/** Apply returns a list of acronym noun phrases.
- @author Andrew McCallum */
-object AcronymNounPhraseFinder extends MentionPhraseFinder {
- def prereqAttrs = Seq(classOf[Token])
- def apply(doc:Document): Seq[Phrase] = {
- val result = new mutable.ArrayBuffer[Phrase]
- for (section <- doc.sections; token <- section.tokens) {
- // Matches middle word of "Yesterday IBM announced" but not "OBAMA WINS ELECTION"
- if ( token.string.length > 2 && !token.containsLowerCase && Character.isUpperCase(token.string(0)) && (token.getNext ++ token.getPrev).exists(_.containsLowerCase)) {
- val phrase = new Phrase(section, token.positionInSection, length=1,offsetToHeadToken = -1)
- phrase.attr += new ConllPhraseEntityType(phrase, "ORG")
- phrase.attr += new NounPhraseType(phrase, "NAM")
- result += phrase
- }
- }
- result
- }
-}
-
-/** Apply returns a list of NNP-indicated proper noun phrases, given PennPosTags.
- @author Andrew McCallum */ -object NnpPosNounPhraseFinder extends MentionPhraseFinder { - def prereqAttrs = Seq(classOf[PennPosTag]) - def apply(doc:Document): Seq[Phrase] = { - val result = new mutable.ArrayBuffer[Phrase] - var start = 0 - for (section <- doc.sections) { - val tokens = section.tokens - while (start < tokens.length) { - val token = tokens(start) - var end = start - while (end < tokens.length && tokens(end).posTag.intValue == PennPosDomain.nnpIndex) end += 1 - if (end != start && tokens(end-1).posTag.intValue == PennPosDomain.nnpIndex) { - val phrase = new Phrase(section, token.positionInSection, length=end-start,offsetToHeadToken = -1) - phrase.attr += new NounPhraseType(phrase, "NAM") - NounPhraseEntityTypeLabeler.process(phrase) - } - start = math.max(start+1, end) - } - } - result - } -} - - - - diff --git a/src/main/scala/cc/factorie/app/nlp/coref/NerForwardCoref.scala b/src/main/scala/cc/factorie/app/nlp/coref/NerForwardCoref.scala new file mode 100644 index 0000000..7a4ae7b --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/NerForwardCoref.scala @@ -0,0 +1,21 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.app.nlp.Document +import cc.factorie.app.nlp.phrase._ + + +/** Forward Coreference on Ner and Pronoun Mentions*/ +class NerForwardCoref extends ForwardCoref { + override def prereqAttrs: Seq[Class[_]] = (ConllPhraseFinder.prereqAttrs ++ AcronymNounPhraseFinder.prereqAttrs++PronounFinder.prereqAttrs ++ NnpPosNounPhraseFinder.prereqAttrs ++ ForwardCoref.prereqAttrs).distinct + override def annotateMentions(document:Document): Unit = { + if(document.coref.mentions.isEmpty) (ConllPhraseFinder(document) ++ PronounFinder(document) ++ NnpPosNounPhraseFinder(document)++ AcronymNounPhraseFinder(document)).distinct.foreach(phrase => document.getCoref.addMention(phrase)) + document.coref.mentions.foreach(mention => NounPhraseEntityTypeLabeler.process(mention.phrase)) + document.coref.mentions.foreach(mention => NounPhraseGenderLabeler.process(mention.phrase)) + document.coref.mentions.foreach(mention => NounPhraseNumberLabeler.process(mention.phrase)) + } +} + +object NerForwardCoref extends NerForwardCoref { + //val stream:InputStream = new DataInputStream(ClasspathURL[NerForwardCoref](".factorie").openConnection().getInputStream) + //deserialize(stream) +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/NerPhraseFinder.scala b/src/main/scala/cc/factorie/app/nlp/coref/NerPhraseFinder.scala new file mode 100644 index 0000000..09807c2 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/NerPhraseFinder.scala @@ -0,0 +1,11 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.app.nlp.Document +import cc.factorie.app.nlp.ner.{NerSpan, NerSpanBuffer} +import cc.factorie.app.nlp.phrase.Phrase + +class NerPhraseFinder[Span <: NerSpan] extends MentionPhraseFinder { + val prereqAttrs = Seq(classOf[NerSpanBuffer[Span]]) + def apply(doc:Document):Seq[Phrase] = + doc.attr[NerSpanBuffer[Span]].map(new Phrase(_)) +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/Node.scala b/src/main/scala/cc/factorie/app/nlp/coref/Node.scala new file mode 100644 index 0000000..9f31f19 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/Node.scala @@ -0,0 +1,14 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.util.{Attr, UniqueId} + +/** Either a mention, entity or sub-entity in an coreference or entity resolution model. 
+ * These are the "nodes" in a trees in which observed mentions are the leaves and inferred entities are the roots. + * In "hierarchical coreference" there may be additional nodes at intermediate levels of the tree. + * + * @author Andrew McCallum */ +trait Node extends UniqueId with Attr { + type ParentType <: Node + /** A pointer to the Node immediate above this Node in the tree. */ + def parent: ParentType +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/PairwiseCorefModel.scala b/src/main/scala/cc/factorie/app/nlp/coref/PairwiseCorefModel.scala index 0033425..01b8560 100644 --- a/src/main/scala/cc/factorie/app/nlp/coref/PairwiseCorefModel.scala +++ b/src/main/scala/cc/factorie/app/nlp/coref/PairwiseCorefModel.scala @@ -1,232 +1,9 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ package cc.factorie.app.nlp.coref -import java.io._ - -import cc.factorie.{la, _} -import cc.factorie.la.{DenseTensor1, SparseBinaryTensor, Tensor1, WeightsMapAccumulator} -import cc.factorie.model.Parameters +import cc.factorie.app +import cc.factorie.la.Tensor1 import cc.factorie.optimize.{Example, OptimizableObjectives, PredictorExample} -import cc.factorie.util.{BinarySerializer, DoubleAccumulator} -import cc.factorie.variable.{CategoricalDomain, CategoricalVectorDomain, DiscreteDomain, VectorDomain} - -trait CorefModel extends Parameters { - val MentionPairFeaturesDomain = new CategoricalVectorDomain[String] { - dimensionDomain.maxSize = 1e6.toInt - dimensionDomain.growPastMaxSize = false - } - val MentionPairCrossFeaturesDomain = new VectorDomain { - def dimensionDomain: DiscreteDomain = new DiscreteDomain(5e6.toInt + 1) - } - - val MentionPairLabelDomain = new CategoricalDomain[String] { this += "YES"; this += "NO"; freeze() } - - object CorefTokenFrequencies{ - var counter:TopTokenFrequencies = null - } - - def deserialize(stream: DataInputStream) { - val headWords = new DefaultHashMap[String,Int](0) - BinarySerializer.deserialize(headWords, stream) - BinarySerializer.deserialize(MentionPairFeaturesDomain, stream) - BinarySerializer.deserialize(new CategoricalVectorDomain[String] { val domain = new CategoricalDomain[String]} , stream) - BinarySerializer.deserialize(this, stream) - CorefTokenFrequencies.counter = new TopTokenFrequencies(headWords) - stream.close() - MentionPairFeaturesDomain.freeze() - } - - def deserialize(filename: String) { - deserialize(new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))) - } - - def serialize(stream: DataOutputStream) { - BinarySerializer.serialize(CorefTokenFrequencies.counter.headWords,stream) - MentionPairFeaturesDomain.freeze() - BinarySerializer.serialize(MentionPairFeaturesDomain , stream) - BinarySerializer.serialize(new CategoricalVectorDomain[String] { val domain = new CategoricalDomain[String]}, stream) - BinarySerializer.serialize(this,stream) - } 
- -} abstract class PairwiseCorefModel extends app.classify.backend.OptimizablePredictor[Double,Tensor1] with CorefModel{ def getExample(label: MentionPairLabel,features:MentionPairFeatures, scale: Double): Example = new PredictorExample(this, features.value, if (label.target.categoryValue == "YES") 1 else -1, OptimizableObjectives.hingeScaledBinary(1.0, 3.0)) } - -class BaseCorefModel extends PairwiseCorefModel { - val pairwise = Weights(new la.DenseTensor1(MentionPairFeaturesDomain.dimensionDomain.maxSize)) - def predict(pairwiseStats: Tensor1) = pairwise.value dot pairwiseStats - def accumulateObjectiveGradient(accumulator: WeightsMapAccumulator, features: Tensor1, gradient: Double, weight: Double) = accumulator.accumulate(pairwise, features, gradient * weight) -} - -class ImplicitCrossProductCorefModel extends PairwiseCorefModel { - val products = Weights(new DenseTensor1(MentionPairCrossFeaturesDomain.dimensionDomain.size)) - val pairwise = Weights(new la.DenseTensor1(MentionPairFeaturesDomain.dimensionDomain.maxSize)) - val domain = new ImplicitDomain(MentionPairFeaturesDomain.dimensionSize) - def predict(pairwiseStats: Tensor1) = - pairwise.value.dot(pairwiseStats) + products.value.dot(new ImplicitFeatureConjunctionTensor(MentionPairCrossFeaturesDomain.dimensionSize, pairwiseStats.asInstanceOf[SparseBinaryTensor], domain)) - def accumulate(acc: WeightsMapAccumulator, pairwiseStats: Tensor1, f: Double) { - acc.accumulate(pairwise, pairwiseStats, f) - acc.accumulate(products, new ImplicitFeatureConjunctionTensor( - MentionPairCrossFeaturesDomain.dimensionSize, pairwiseStats.asInstanceOf[SparseBinaryTensor], domain), f) - } - - def accumulateObjectiveGradient(accumulator: WeightsMapAccumulator, features: Tensor1, gradient: Double, weight: Double) = { - accumulator.accumulate(pairwise, features, gradient) - accumulator.accumulate(products, new ImplicitFeatureConjunctionTensor( - MentionPairCrossFeaturesDomain.dimensionSize, features.asInstanceOf[SparseBinaryTensor], domain), gradient * weight) - } -} - -class StructuredCorefModel extends CorefModel { - val pairwiseWeights = Weights(new la.DenseTensor1(MentionPairFeaturesDomain.dimensionDomain.maxSize)) - - def predict(pairwiseStats: Tensor1) = pairwiseWeights.value dot pairwiseStats - - def getExample(mentionGraph: MentionGraph): Seq[Example] = { - Seq(new GraphExample(this, mentionGraph)) - } - - def getExamples(graphs: Seq[MentionGraph]): Seq[Example] = { - graphs.map{g => new GraphExample(this,g)} - } - - override def deserialize(stream: DataInputStream) { - val firstWords = new DefaultHashMap[String,Int](0) - val headWords = new DefaultHashMap[String,Int](0) - val lastWords = new DefaultHashMap[String,Int](0) - val precContext = new DefaultHashMap[String,Int](0) - val followContext = new DefaultHashMap[String,Int](0) - val wordForm = new DefaultHashMap[String,Int](0) - val shapes = new DefaultHashMap[String,Int](0) - BinarySerializer.deserialize(headWords,stream) - BinarySerializer.deserialize(firstWords,stream) - BinarySerializer.deserialize(lastWords,stream) - BinarySerializer.deserialize(precContext,stream) - BinarySerializer.deserialize(followContext,stream) - BinarySerializer.deserialize(shapes,stream) - BinarySerializer.deserialize(wordForm,stream) - BinarySerializer.deserialize(MentionPairFeaturesDomain, stream) - BinarySerializer.deserialize(new CategoricalVectorDomain[String] { val domain = new CategoricalDomain[String]} , stream) - BinarySerializer.deserialize(this, stream) - val newLexicalCounts = new 
TopTokenFrequencies(headWords,firstWords,lastWords,precContext,followContext,shapes,wordForm) - stream.close() - CorefTokenFrequencies.counter = newLexicalCounts - MentionPairFeaturesDomain.freeze() - } - - override def serialize(stream: DataOutputStream) { - MentionPairFeaturesDomain.freeze() - BinarySerializer.serialize(CorefTokenFrequencies.counter.headWords,stream) - BinarySerializer.serialize(CorefTokenFrequencies.counter.firstWords,stream) - BinarySerializer.serialize(CorefTokenFrequencies.counter.lastWords,stream) - BinarySerializer.serialize(CorefTokenFrequencies.counter.precContext,stream) - BinarySerializer.serialize(CorefTokenFrequencies.counter.followContext,stream) - BinarySerializer.serialize(CorefTokenFrequencies.counter.shapes,stream) - BinarySerializer.serialize(CorefTokenFrequencies.counter.wordForm,stream) - BinarySerializer.serialize(MentionPairFeaturesDomain, stream) - BinarySerializer.serialize(new CategoricalVectorDomain[String] { val domain = new CategoricalDomain[String]}, stream) - BinarySerializer.serialize(this,stream) - } - - def normAntecedents(scores: Array[Double]): Array[Double] = { - val antecedents = scores.map(Math.exp) - val total = antecedents.reduce(_+_) - for(anteIn <- 0 until antecedents.length) { - antecedents(anteIn) /= total - } - antecedents - } - - def scoreGraph(mentionGraph: MentionGraph): Array[Array[Double]] = { - val scores = new Array[Array[Double]](mentionGraph.graph.length) - for (i <- 0 until mentionGraph.graph.length) { - scores(i) = new Array[Double](i+1) - for (j <- 0 until mentionGraph.graph(i).length) { - if(mentionGraph.prunedEdges(i)(j)) scores(i)(j) = Double.NegativeInfinity - else{ - require(mentionGraph.graph(i)(j).features.domain.dimensionSize > 0) - scores(i)(j) = predict(mentionGraph.graph(i)(j).features.value) - } - } - } - scores - } - - def calculateMarginals(scores:Array[Array[Double]],mentionGraph: MentionGraph, gold: Boolean = false):Array[Array[Double]] = { - val marginals = new Array[Array[Double]](mentionGraph.graph.length) - for (i <- 0 until mentionGraph.graph.length) { - var normalizer = 0.0 - val goldAntecedents = if (gold) mentionGraph.graph(i).filter(p => p != null && p.initialValue) else null - marginals(i) = Array.fill(mentionGraph.graph(i).length)(0.0) - for(edgeIdx<- 0 until mentionGraph.graph(i).length){ - if(!mentionGraph.prunedEdges(i)(edgeIdx)){ - val edge = mentionGraph.graph(i)(edgeIdx) - if (!gold || goldAntecedents.contains(edge)) { - //pair loss score is set at graph generation - val unnormalizedProb = Math.exp(scores(i)(edgeIdx) - edge.lossScore) - marginals(i)(edgeIdx) = unnormalizedProb - normalizer += unnormalizedProb - } - else - marginals(i)(edgeIdx) = 0.0 - } - else - marginals(i)(edgeIdx) = 0.0 - } - for(edgeIdx<- 0 until mentionGraph.graph(i).length){ - marginals(i)(edgeIdx) = if(normalizer == 0) 0.0 else marginals(i)(edgeIdx) / normalizer - } - } - marginals - } - - def computeLikelihood( mentionGraph: MentionGraph, goldMarginal: Array[Array[Double]],predictedMarginalScores: Array[Array[Double]]): Double = { - var likelihood = 0.0 - for (currIdx <- 0 until mentionGraph.graph.length) { - val currMention = mentionGraph.orderedMentionList(currIdx) - var goldAntecedents = if(currMention.entity ne null) currMention.entity.mentions.filter(m => m.phrase.start < currMention.phrase.start) else Iterable.empty - if(goldAntecedents.isEmpty) goldAntecedents = Set(currMention) - var currProb = 0.0 - for (linkIdx <- 0 until goldAntecedents.size) { - if(currIdx == -1 || linkIdx == -1 || 
mentionGraph.prunedEdges(currIdx)(linkIdx)) currProb += 0.0 - else currProb += goldMarginal(currIdx)(linkIdx) - predictedMarginalScores(currIdx)(linkIdx) - } - var currLogProb = Math.log(currProb) - if (currLogProb.isInfinite) - currLogProb = -30 - likelihood += currLogProb - } - likelihood - } -} - -class GraphExample[Output, Prediction, Input<:MentionGraph](model: StructuredCorefModel, input: Input) extends Example { - def accumulateValueAndGradient(value: DoubleAccumulator, gradient: WeightsMapAccumulator) { - val scores = model.scoreGraph(input) - val predictionMarginalScores = model.calculateMarginals(scores,input,gold = false) - val goldMarginalScores = model.calculateMarginals(scores,input,gold = true) - val likelihood = model.computeLikelihood(input,goldMarginalScores,predictionMarginalScores) - if (value != null) value.accumulate(likelihood) - for (i <- 0 until input.graph.length) { - for (edgeIdx <- 0 until input.graph(i).length; if !input.prunedEdges(i)(edgeIdx)) { - if(gradient != null){ - gradient.accumulate(model.pairwiseWeights, input.graph(i)(edgeIdx).features.value, goldMarginalScores(i)(edgeIdx) - predictionMarginalScores(i)(edgeIdx)) - } - } - } - } -} - diff --git a/src/main/scala/cc/factorie/app/nlp/coref/ParseForwardCoref.scala b/src/main/scala/cc/factorie/app/nlp/coref/ParseForwardCoref.scala new file mode 100644 index 0000000..9c81f15 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/ParseForwardCoref.scala @@ -0,0 +1,20 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.app.nlp.Document +import cc.factorie.app.nlp.phrase.{NounPhraseEntityTypeLabeler, NounPhraseGenderLabeler, NounPhraseNumberLabeler, ParseAndNerBasedPhraseFinder} + +/**Forward Coreference on Proper Noun, Pronoun and Common Noun Mentions*/ +class ParseForwardCoref extends ForwardCoref { + override def prereqAttrs: Seq[Class[_]] = ParseAndNerBasedPhraseFinder.prereqAttrs.toSeq ++ ForwardCoref.prereqAttrs + override def annotateMentions(document:Document): Unit = { + if(document.coref.mentions.isEmpty) ParseAndNerBasedPhraseFinder.getPhrases(document).foreach(document.coref.addMention) + document.coref.mentions.foreach(mention => NounPhraseEntityTypeLabeler.process(mention.phrase)) + document.coref.mentions.foreach(mention => NounPhraseGenderLabeler.process(mention.phrase)) + document.coref.mentions.foreach(mention => NounPhraseNumberLabeler.process(mention.phrase)) + } +} + +object ParseForwardCoref extends ParseForwardCoref { + //TODO This is commented out for now, but needs investigation + //deserialize(new DataInputStream(ClasspathURL[ParseForwardCoref](".factorie").openConnection().getInputStream)) +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/coref/PronounFinder.scala b/src/main/scala/cc/factorie/app/nlp/coref/PronounFinder.scala new file mode 100644 index 0000000..3e349c2 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/PronounFinder.scala @@ -0,0 +1,17 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.app.nlp.Document +import cc.factorie.app.nlp.phrase.{NounPhraseType, Phrase} +import cc.factorie.app.nlp.pos.PennPosTag + +/** Apply returns a list of pronoun phrases, given PennPosTags. 
+ * + * @author Andrew McCallum */ +object PronounFinder extends MentionPhraseFinder { + def prereqAttrs = Seq(classOf[PennPosTag]) + def apply(document:Document): Seq[Phrase] = { + val phrases = document.tokens.filter(_.attr[PennPosTag].isPersonalPronoun).map(t => new Phrase(t.section, start=t.positionInSection, length=1,offsetToHeadToken = -1)).toSeq + for (phrase <- phrases) phrase.attr += new NounPhraseType(phrase, "PRO") + phrases + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/PronounSets.scala b/src/main/scala/cc/factorie/app/nlp/coref/PronounSets.scala new file mode 100644 index 0000000..1086e17 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/PronounSets.scala @@ -0,0 +1,77 @@ +package cc.factorie.app.nlp.coref + +import scala.collection.mutable + +object PronounSets { + val firstPerson = Set("i", "me", "myself", "mine", "my", "we", "us", "ourself", "ourselves", "ours", "our") + val secondPerson = Set("you", "yourself", "yours", "your", "yourselves") + val thirdPerson = Set("he", "him", "himself", "his", "she", "herself", "hers", "her", "it", "itself", "its", "one", "oneself", "one's", "they", "them", "themself", "themselves", "theirs", "their", "'em") + val other = Set("who", "whom", "whose", "where", "when", "which") + + val demonstrative = Set("this", "that", "these", "those") + + val singular = Set("i", "me", "myself", "mine", "my", "yourself", "he", "him", "himself", "his", "she", "her", "herself", "hers", "her", "it", "itself", "its", "one", "oneself", "one's") + val plural = Set("we", "us", "ourself", "ourselves", "ours", "our", "yourself", "yourselves", "they", "them", "themself", "themselves", "theirs", "their") + val male = Set("he", "him", "himself", "his") + val female = Set("her", "hers", "herself", "she") + + val reflexive = Set("herself", "himself", "itself", "themselves", "yourselves", "oneself", "yourself", "themself", "myself") + + val neuter = Set("it", "its", "itself", "this", "that", "anything", "something", "everything", "nothing", "which", "what", "whatever", "whichever") + val personal = Set("you", "your", "yours", "i", "me", "my", "mine", "we", "our", "ours", "us", "myself", "ourselves", "themselves", "themself", "ourself", "oneself", "who", "whom", "whose", "whoever", "whomever", "anyone", "anybody", "someone", "somebody", "everyone", "everybody", "nobody") + + val allPronouns = firstPerson ++ secondPerson ++ thirdPerson ++ other + val allPersonPronouns = allPronouns -- neuter + val canonicalForms = new mutable.HashMap[String,String](){ + ("i", "i") + ("i", "i") + ("me", "i") + ("my", "i") + ("myself", "i") + ("mine", "i") + ("you", "you") + ("your", "you") + ("yourself", "you") + ("yourselves", "you") + ("yours", "you") + ("he", "he") + ("him", "he") + ("his", "he") + ("himself", "he") + ("she", "she") + ("her", "she") + ("herself", "she") + ("hers", "she") + ("we", "we") + ("us", "we") + ("our", "we") + ("ourself", "we") + ("ourselves", "we") + ("ours", "we") + ("they", "they") + ("them", "they") + ("their", "they") + ("themself", "they") + ("themselves", "they") + ("theirs", "they") + ("'em", "they") + ("it", "it") + ("itself", "it") + ("its", "it") + ("one", "one") + ("oneself", "one") + ("one's", "one") + ("this", "this") + ("that", "that") + ("these", "these") + ("those", "those") + ("which", "which") + ("who", "who") + ("whom", "who") + ("thy", "thy") + ("y'all", "you") + ("you're", "you") + ("you'll", "you") + ("'s", "'s") + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/StructuredCoreference.scala 
b/src/main/scala/cc/factorie/app/nlp/coref/StructuredCoreference.scala deleted file mode 100644 index 3d7bd23..0000000 --- a/src/main/scala/cc/factorie/app/nlp/coref/StructuredCoreference.scala +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.coref - -import java.io.DataInputStream -import java.util.concurrent.ExecutorService - -import cc.factorie.app.nlp.phrase.{ParseAndNerBasedPhraseFinder, _} -import cc.factorie.app.nlp.pos.PennPosTag -import cc.factorie.app.nlp.{Document, Sentence} -import cc.factorie.optimize._ -import cc.factorie.util.ClasspathURL -import cc.factorie.variable.LabeledCategoricalVariable - -object NerStructuredCoref extends NerStructuredCoref{ - deserialize(new DataInputStream(ClasspathURL[NerStructuredCoref](".factorie").openConnection().getInputStream)) - options.featureSet = "lexical" -} - -class NerStructuredCoref extends StructuredCoref{ - override def prereqAttrs: Seq[Class[_]] = (ConllPhraseFinder.prereqAttrs ++ AcronymNounPhraseFinder.prereqAttrs++PronounFinder.prereqAttrs ++ NnpPosNounPhraseFinder.prereqAttrs).distinct - override def annotateMentions(doc:Document): Unit = { - (ConllPhraseFinder(doc) ++ PronounFinder(doc) ++ NnpPosNounPhraseFinder(doc)++ AcronymNounPhraseFinder(doc)).distinct.foreach(phrase => doc.getCoref.addMention(phrase)) - doc.coref.mentions.foreach(mention => NounPhraseEntityTypeLabeler.process(mention.phrase)) - doc.coref.mentions.foreach(mention => NounPhraseGenderLabeler.process(mention.phrase)) - doc.coref.mentions.foreach(mention => NounPhraseNumberLabeler.process(mention.phrase)) - } -} - -object ParseStructuredCoref extends ParseStructuredCoref{ - deserialize(new DataInputStream(ClasspathURL[ParseStructuredCoref](".factorie").openConnection().getInputStream)) - options.featureSet = "lexical" -} - -//Uses Parse Based Mention Finding, best for data with nested mentions in the ontonotes annotation style -class ParseStructuredCoref extends StructuredCoref{ - override def prereqAttrs: Seq[Class[_]] = ParseAndNerBasedPhraseFinder.prereqAttrs.toSeq - override def annotateMentions(doc:Document): Unit = { - if(doc.coref.mentions.isEmpty) ParseAndNerBasedPhraseFinder.getPhrases(doc).foreach(doc.coref.addMention) - doc.coref.mentions.foreach(mention => NounPhraseEntityTypeLabeler.process(mention.phrase)) - doc.coref.mentions.foreach(mention => NounPhraseGenderLabeler.process(mention.phrase)) - doc.coref.mentions.foreach(mention => NounPhraseNumberLabeler.process(mention.phrase)) - } -} - -/**The base Structured Coref class uses Gold Mentions, used for evaluation and tests on ConllData*/ -class StructuredCoref extends CorefSystem[MentionGraph]{ - val options = new CorefOptions - val model: StructuredCorefModel = new StructuredCorefModel - override def prereqAttrs: Seq[Class[_]] = 
Seq(classOf[Sentence],classOf[PennPosTag]) - - def preprocessCorpus(trainDocs:Seq[Document]) = { - val nonPronouns = trainDocs.flatMap(_.targetCoref.mentions.filterNot(m => m.phrase.isPronoun)) - model.CorefTokenFrequencies.counter = new TopTokenFrequencies(nonPronouns,Vector("Head","First","Last","Prec","Follow","Shape","WordForm"),20) - } - - def getCorefStructure(coref:WithinDocCoref) = new MentionGraph(model,coref,options,train=true) - - def instantiateModel(optimizer:GradientOptimizer,pool:ExecutorService) = new SoftMaxParallelTrainer(optimizer,pool) - - class SoftMaxParallelTrainer(optimizer: GradientOptimizer, pool: ExecutorService) extends ParallelTrainer(optimizer,pool){ - def map(d:MentionGraph): Seq[Example] = model.getExample(d) - } - - def infer(coref: WithinDocCoref): WithinDocCoref = { - val instance = new MentionGraph(model,coref,options) - val mentions = instance.orderedMentionList - val scores = model.scoreGraph(instance) - for(i <- 0 until instance.orderedMentionList.length){ - val m1 = mentions(i) - val (bestCandIndx,score) = getBestCandidate(instance, scores(i), i) - val bestCand = mentions(bestCandIndx) - if(bestCand != m1){ - if(bestCand.entity ne null) - bestCand.entity += m1 - else {val entity = coref.newEntity(); entity += m1; entity += bestCand} - } else{val entity = coref.newEntity();entity += m1} - } - coref - } - - def getBestCandidate(mentionGraph: MentionGraph, scores: Array[Double], currMentionIdx: Int): (Int,Double) = { - val antecedentScores = model.normAntecedents(scores) - var bestIdx = -1 - var bestProb = Double.NegativeInfinity - for (anteIdx <- 0 to currMentionIdx) { - val currProb = antecedentScores(anteIdx) - if (bestIdx == -1 || currProb > bestProb) { - bestIdx = anteIdx - bestProb = currProb - } - } - (bestIdx,bestProb) - } -} - -class MentionGraphLabel(model: CorefModel,val currentMention: Int, val linkedMention: Int, val initialValue: Boolean, val lossScore: Double, mentions: Seq[Mention],options: CorefOptions) extends LabeledCategoricalVariable(if (initialValue) "YES" else "NO"){ - def domain = model.MentionPairLabelDomain - val features = new MentionPairFeatures(model,mentions(currentMention),mentions(linkedMention),mentions,options) -} - -class MentionGraph(model: CorefModel, val coref: WithinDocCoref, options: CorefOptions, train: Boolean = false){ - var orderedMentionList = coref.mentions.sortBy(m=>m.phrase.start) - var graph = new Array[Array[MentionGraphLabel]](orderedMentionList.size) - var prunedEdges = new Array[Array[Boolean]](orderedMentionList.size) - - for (i <- 0 until orderedMentionList.size) { - prunedEdges(i) = Array.fill(i+1)(false) - } - - for (currMentionIdx <- 0 until orderedMentionList.size) { - graph(currMentionIdx) = new Array[MentionGraphLabel](currMentionIdx+1) - val currentMention = orderedMentionList(currMentionIdx) - for (anteMentionIdx <- 0 to currMentionIdx) { - val anteMention = orderedMentionList(anteMentionIdx) - //If we don't have a constraint on the pair, then add the linking mention as a possible antecedent - if(!pruneMentionPair(currMentionIdx,anteMentionIdx)){ - val lossScore = if(train) getLossScore(currentMention,anteMention) else 0.0 - var initialValue = false - if(train){ - initialValue = if(currentMention == anteMention){ - //This is ugly but it's a side effect of having singleton clusters during training - currentMention.entity == null || (currentMention.entity != null && currentMention == currentMention.entity.getFirstMention) - } else currentMention.entity != null && anteMention.entity != null && 
currentMention.entity == anteMention.entity - } - graph(currMentionIdx)(anteMentionIdx) = new MentionGraphLabel(model,currMentionIdx,anteMentionIdx,initialValue,lossScore,orderedMentionList,options) - } else prunedEdges(currMentionIdx)(anteMentionIdx) = true - } - } - - def pruneMentionPair(currMentionIdx:Int, anteMentionIdx:Int):Boolean = { - var skip = false - val currentMention = orderedMentionList(currMentionIdx) - val antecedentMention = orderedMentionList(anteMentionIdx) - val currSentIdx = currentMention.phrase.sentence.indexInSection - val anteSentIdx = antecedentMention.phrase.sentence.indexInSection - val cataphora = antecedentMention.phrase.isPronoun && !currentMention.phrase.isPronoun - val label = currentMention.entity == antecedentMention.entity - if(cataphora){ - if (label && !options.allowPosCataphora || !label && !options.allowNegCataphora) { - skip = true - } - } - if (anteMentionIdx < currMentionIdx - options.maxMentDist || (currentMention.phrase.isPronoun && currSentIdx - anteSentIdx > options.maxPronDist)) { - skip = true - } - if(antecedentMention != currentMention && !antecedentMention.phrase.tokens.intersect(currentMention.phrase.tokens).isEmpty) skip = true - skip - } - - def getLossScore(currMention:Mention, antMention:Mention):Double = { - val headCluster = if(currMention.entity ne null) currMention.entity.getFirstMention else currMention - if (headCluster == currMention && currMention != antMention) { - falseLinkScore - } else if (headCluster != currMention && currMention == antMention) { - falseNewScore - } else if (headCluster != currMention && currMention.entity == antMention.entity) { - wrongLinkScore - } else { - 0.0 - } - } - //Berkeley's Tuned Scores - val falseLinkScore = -0.1 - val falseNewScore = -3.0 - val wrongLinkScore = -1.0 -} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/TokenFreqs.scala b/src/main/scala/cc/factorie/app/nlp/coref/TokenFreqs.scala new file mode 100644 index 0000000..e86971e --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/TokenFreqs.scala @@ -0,0 +1,44 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.app.nlp.Token + +object TokenFreqs{ + def countWordTypes(nonPronouns: Seq[Mention],specificWordFunc: (Mention) => String, cutoff: Int): DefaultHashMap[String,Int] = { + countAndPrune(nonPronouns.map(specificWordFunc),cutoff) + } + + private def countAndPrune(words: Seq[String], cutoff: Int): DefaultHashMap[String,Int] = { + val counts = new DefaultHashMap[String,Int](0) + words.foreach(key=>counts(key) += 1) + counts.foreach{case (key,value) => if(value < cutoff) counts.remove(key)} + counts + } + + def getTokenAtOffset(token: Token,offset: Int): Token = { val t = token.next(offset); if (t ne null) t else null } + def getTokenStringAtOffset(token: Token,offset: Int): String = { val t = token.next(offset); if (t ne null) t.string else ""} + + def getWordClass(word: Token):String = { + val sb = new StringBuilder + if (word.isCapitalized) { + if (word.containsLowerCase) sb.append("Cap-Mix") + else sb.append("Cap") + } + if (word.isDigits) sb.append("Num") + else if (word.containsDigit) sb.append("Num-Mix") + if (word.string.contains('-')) sb.append("Dash") + if (word.string.contains('s') && word.string.length() >= 3) sb.append("-S") + else if (word.string.length() >= 5){ + val lowerCase = word.string.toLowerCase + if (lowerCase.endsWith("ed")) sb.append("-ed") + else if (lowerCase.endsWith("ing")) sb.append("-ing") + else if (lowerCase.endsWith("ion")) sb.append("-ion") + else if (lowerCase.endsWith("er")) 
sb.append("-er") + else if (lowerCase.endsWith("est")) sb.append("-est") + else if (lowerCase.endsWith("ly")) sb.append("-ly") + else if (lowerCase.endsWith("ity")) sb.append("-ity") + else if (lowerCase.endsWith("y")) sb.append("-y") + else sb.append("-none") + } + sb.toString() + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/TopTokenFrequencies.scala b/src/main/scala/cc/factorie/app/nlp/coref/TopTokenFrequencies.scala new file mode 100644 index 0000000..a318740 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/TopTokenFrequencies.scala @@ -0,0 +1,29 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie.app.nlp.Token + +class TopTokenFrequencies(val headWords: DefaultHashMap[String,Int], + val firstWords: DefaultHashMap[String,Int] = null, + val lastWords: DefaultHashMap[String,Int] = null, + val precContext: DefaultHashMap[String,Int] = null, + val followContext: DefaultHashMap[String,Int] = null, + val shapes: DefaultHashMap[String,Int] = null, + val wordForm: DefaultHashMap[String,Int] = null, default: Int = 20) { + def this(nonPronouns: Seq[Mention],typesOfCounts: Seq[String], default:Int) = this( + if(typesOfCounts.contains("Head")) TokenFreqs.countWordTypes(nonPronouns,(t) => t.phrase.headToken.string.toLowerCase,default) else null, + if(typesOfCounts.contains("First")) TokenFreqs.countWordTypes(nonPronouns,(t) => t.phrase.tokens(0).string.toLowerCase,default)else null, + if(typesOfCounts.contains("Last")) TokenFreqs.countWordTypes(nonPronouns,(t) => t.phrase.last.string.toLowerCase,default)else null, + if(typesOfCounts.contains("Prec")) TokenFreqs.countWordTypes(nonPronouns,(t) => TokenFreqs.getTokenStringAtOffset(t.phrase.tokens(0),-1).toLowerCase,default)else null, + if(typesOfCounts.contains("Follow")) TokenFreqs.countWordTypes(nonPronouns,(t) => TokenFreqs.getTokenStringAtOffset(t.phrase.last,1).toLowerCase,default)else null, + if(typesOfCounts.contains("Shape")) TokenFreqs.countWordTypes(nonPronouns,(t) => cc.factorie.app.strings.stringShape(t.phrase.string,2),default)else null, + if(typesOfCounts.contains("WordForm")) TokenFreqs.countWordTypes(nonPronouns,(t) => TokenFreqs.getWordClass(t.phrase.headToken),default)else null) + + + //If this token is not a top token, fall back on using pos tag + def containsToken(lexicon: DefaultHashMap[String,Int],token: Token): String = { + if(lexicon.contains(token.string.toLowerCase)) token.string.toLowerCase + else token.posTag.categoryValue + } + + def containsString(lexicon: DefaultHashMap[String,Int],tokenString: String): String = if(lexicon.contains(tokenString)) tokenString else "" +} diff --git a/src/main/scala/cc/factorie/app/nlp/coref/WithinDocCoref.scala b/src/main/scala/cc/factorie/app/nlp/coref/WithinDocCoref.scala new file mode 100644 index 0000000..6479544 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/coref/WithinDocCoref.scala @@ -0,0 +1,124 @@ +package cc.factorie.app.nlp.coref + +import cc.factorie._ +import cc.factorie.app.nlp.phrase.Phrase +import cc.factorie.app.nlp.{Document, Section, Token, TokenSpan} +import cc.factorie.util.EvaluatableClustering +import cc.factorie.variable.Span + + +/** Container for a within-document coreference solution, typically stored as an attr of the Document. + * Some may contain an imperfect inferred coref solution; others may store a gold-standard target coref solution. + * Concrete instances of Mention and WithinDocEntity are created here. 
+ * + * @author Andrew McCallum + */ +class WithinDocCoref(val document:Document) extends EvaluatableClustering[WithinDocEntity,Phrase#Value] { + /** When we have labeled gold-standard truth for coref, it is stored here. */ + var target: WithinDocCoref = null // ...the alternative would have been to create different subclasses of WithinDocCoref so they could be stored separately in the Document.attr, but I chose this as cleaner. -akm + /** A mapping from (the Phrase's span value) to Mention */ + private val _spanToMention = new scala.collection.mutable.LinkedHashMap[Span[Section,Token],Mention] + //private val _phraseToMention = new scala.collection.mutable.LinkedHashMap[Phrase,Mention] // Used to index by this instead. I think we can remove this now. -akm + /** A mapping from entity.uniqueId to WithinDocEntity */ + private val _entities = new scala.collection.mutable.LinkedHashMap[String,WithinDocEntity] + /** A mapping from entity key (i.e. an Int identifying the true entity) to the entity.uniqueId */ + private lazy val _entityKeyToId = new scala.collection.mutable.HashMap[Int,String] + private var _entityCount = 0 // The number of WithinDocEntities ever created here. This number never goes down. + /** A string that will be used as a prefix on the uniqueIds of the Mentions and WithinDocEntities created here. */ + def uniqueId: String = document.uniqueId // TODO Perhaps this should be something more safely unique if we save more than one WithinDocCoref objects per Document? -akm + def uniqueIdEntitySuffix(entityIndex:Int): String = "//WithinDocEntity" + entityIndex + def uniqueIdMentionSuffix(phraseStart:Int, phraseLength:Int): String = "//Mention(" + phraseStart + "," + phraseLength + ")" + /** Concrete implementation of WithinDocEntity that automatically stores itself in WithinDocCoref.entities. */ + protected class WithinDocEntity1(val uniqueId:String) extends WithinDocEntity(document) { + def this() = this(WithinDocCoref.this.uniqueId + uniqueIdEntitySuffix(_entityCount)) // TODO Is this what we want? -akm + _entityCount += 1 + assert(!_entities.contains(uniqueId)) + _entities(uniqueId) = this + def coref: WithinDocCoref = WithinDocCoref.this + } + /** Concrete implementation of Mention that automatically stores itself in WithinDocCoref.mentions. */ + protected class Mention1(phrase:Phrase, entity:WithinDocEntity) extends Mention(phrase) { + def this(phrase:Phrase, entityKey:Int) = this(phrase, entityFromKey(entityKey)) // Typically used for labeled data + def this(phrase:Phrase, entityUniqueId:String) = this(phrase, entityFromUniqueId(entityUniqueId)) // Typically used for deserialization + def this(phrase:Phrase) = this(phrase, null.asInstanceOf[WithinDocEntity]) // Typically used for new inference // TODO Should this be null, or a newly created blank Entity; See LoadConll2011 also. + assert(entity == null || entity.asInstanceOf[WithinDocEntity1].coref == WithinDocCoref.this) + _spanToMention(phrase.value) = this + val uniqueId = WithinDocCoref.this.uniqueId + uniqueIdMentionSuffix(phrase.start, phrase.length) // TODO Is this what we want? -akm + if (entity ne null) entity += this + def coref: WithinDocCoref = WithinDocCoref.this + } + + /** Given Span (typically the value of a Phrase), return the corresponding Mention. + Note that Span is a case class, so the lookup is done by the span's boundaries, not by its identity. */ + def mention(span:Span[Section,Token]): Option[Mention] = _spanToMention.get(span) + /** Return the Mention corresponding to the given Phrase. 
If none present, return null. + Note that since the lookup happens by the Phrase's Span value, the returned mention.phrase may be different than this method's argument. */ + def mention(phrase:Phrase): Option[Mention] = _spanToMention.get(phrase.value) + + /** Create a new Mention whose entity will be null. */ + def addMention(phrase:Phrase): Mention = _spanToMention.getOrElse(phrase.value, new Mention1(phrase)) + /** Create a new Mention with entity specified by given uniqueId. */ + def addMention(phrase:Phrase, entityId:String): Mention = { assert(!_spanToMention.contains(phrase.value)); new Mention1(phrase, entityId) } + /** Create a new Mention with entity specified by given key. */ + def addMention(phrase:Phrase, entityKey:Int): Mention = { assert(!_spanToMention.contains(phrase.value)); new Mention1(phrase, entityKey) } + /** Create a new Mention with the given entity, which must also be in this WithinDocCoref */ + def addMention(phrase:Phrase, entity:WithinDocEntity): Mention = new Mention1(phrase, entity) + + /** Remove a Mention from this coreference solution, and from its entity if it has one. */ + def deleteMention(mention:Mention): Unit = { + if (mention.entity ne null) mention.entity -= mention + _spanToMention.remove(mention.phrase.value) + } + + /** Checks whether the given tokenspan overlaps with an existing mention, returns the overlapping mention if it does. */ + def findOverlapping(tokenSpan:TokenSpan):Option[Mention] = tokenSpan match { + case ts if ts.document == this.document => mentions.find(_.phrase.characterOffsets overlapsWith ts.characterOffsets) + case _ => None + } + + /** Return all Mentions in this coreference solution. */ + def mentions: Seq[Mention] = _spanToMention.values.toVector + /** Return a collection of WithinDocEntities managed by this coref solution. Note that some of them may have no Mentions. */ + def entities: Iterable[WithinDocEntity] = _entities.values + /** Create and return a new WithinDocEntity with uniqueId determined by the number entities created so far. */ + def newEntity(): WithinDocEntity = new WithinDocEntity1() + /** Return the entity associated with the given uniqueId, or create a new entity if not found already among 'entities'. */ + def entityFromUniqueId(id:String): WithinDocEntity = _entities.getOrElse(id, new WithinDocEntity1(id)) + /** Return the entity associated with the given key, or create a new entity if not found already among 'entities'. */ + def entityFromKey(key:Int): WithinDocEntity = { + val id = _entityKeyToId.getOrElse(key,null) + val result = if (id eq null) new WithinDocEntity1 else _entities(id) + _entityKeyToId(key) = result.uniqueId + result + } + /** Return the entity associated with the given uniqueId. Return null if not found. */ + def idToEntity(id:String): WithinDocEntity = _entities(id) + /** Remove from the list of entities all entities that contain no mentions. */ + def trimEmptyEntities(): Unit = _entities.values.filter(_.mentions.size == 0).map(_.uniqueId).foreach(_entities.remove) // TODO But note that this doesn't purge _entityKeyToId; perhaps it should. + /** Remove from all entities and mentions associated with entities that contain only one mention. 
+  def removeSingletons():Unit ={
+    _entities.values.filter(_.mentions.size == 1).map(_.uniqueId).foreach{
+      id =>
+        _entities(id).mentions.foreach(m => deleteMention(m))
+        _entities.remove(id)
+    }
+  }
+
+  /** Reset the clustered entities for this coref solution without losing mentions and their cached properties. */
+  def resetPredictedMapping():Unit = {_entities.clear();mentions.foreach(_._setEntity(null));_entityCount = 0 }
+
+  // Support for evaluation
+  // These ensure we ignore any singletons for CoNLL scoring
+  // TODO: Allow for ACE scoring where singletons are counted
+  def clusterIds: Iterable[WithinDocEntity] = _entities.values.filterNot(_.isSingleton)
+  def pointIds: Iterable[Phrase#Value] = _spanToMention.values.filterNot(m => m.entity == null || m.entity.isSingleton).map(_.phrase.value)
+  def pointIds(entityId:WithinDocEntity): Iterable[Phrase#Value] = if(!entityId.isSingleton) entityId.mentions.map(_.phrase.value) else Seq()
+  def intersectionSize(entityId1:WithinDocEntity, entityId2:WithinDocEntity): Int = if(!entityId1.isSingleton && !entityId2.isSingleton) entityId1.mentions.map(_.phrase.value).intersect(entityId2.mentions.map(_.phrase.value)).size else 0
+  def clusterId(mentionId:Phrase#Value): WithinDocEntity = {
+    val mention = _spanToMention.getOrElse(mentionId,null)
+    if(mention == null || mention.entity == null ||mention.entity.isSingleton) null
+    else mention.entity
+  }
+
+
+}
\ No newline at end of file
diff --git a/src/main/scala/cc/factorie/app/nlp/coref/WithinDocEntity.scala b/src/main/scala/cc/factorie/app/nlp/coref/WithinDocEntity.scala
new file mode 100644
index 0000000..c2198e8
--- /dev/null
+++ b/src/main/scala/cc/factorie/app/nlp/coref/WithinDocEntity.scala
@@ -0,0 +1,53 @@
+package cc.factorie.app.nlp.coref
+
+import cc.factorie.app.nlp.Document
+import cc.factorie.app.nlp.phrase.{NounPhraseType, NounPhraseTypeDomain}
+import cc.factorie.app.nlp.pos.PennPosDomain
+
+/** An entity whose evidence comes from some Phrases within a single document.
+  * Users should not create these themselves, but rather use WithinDocCoref to create them.
+  * The uniqueId is abstract.
+  *
+  * @author Andrew McCallum */
+abstract class WithinDocEntity(val document:Document) extends AbstractEntity {
+  type ParentType = WithinDocEntity
+  private val _mentions = new scala.collection.mutable.LinkedHashSet[Mention]
+  def parent: WithinDocEntity = null
+  def mentions:scala.collection.Set[Mention] = _mentions
+  def isSingleton:Boolean = _mentions.size == 1
+  def isEmpty:Boolean = _mentions.isEmpty
+  def children: Iterable[Mention] = _mentions
+  // TODO Rename this to remove the "get".
+  def getFirstMention: Mention = if(isEmpty) null else if(isSingleton) _mentions.head else mentions.minBy(m => m.phrase.start)
+  def +=(mention:Mention): Unit = {
+    assert(mention.phrase.document eq document)
+    //assert(!_mentions.contains(mention)) // No reason to do this; might catch a bug.
+    if (mention.entity ne null) mention.entity._mentions -= mention
+    if(!_mentions.contains(mention))_mentions += mention
+    mention._setEntity(WithinDocEntity.this)
+  }
+  def -=(mention:Mention): Unit = {
+    assert(mention.phrase.document eq document)
+    assert(_mentions.contains(mention)) // No reason to do this; might catch a bug.
+    assert(mention.entity == this)
+    _mentions -= mention
+    mention._setEntity(null)
+  }
+
+  /** Return the canonical mention for the entity cluster.
+      If the canonical mention is not already set, it computes, sets, and returns the canonical mention. */
+  def getCanonicalMention: Mention = {
+    if (canonicalMention eq null) {
+      val canonicalOption = _mentions.filter{m =>
+        (m.phrase.attr[NounPhraseType].value == NounPhraseTypeDomain.value("NOM") ||
+          m.phrase.attr[NounPhraseType].value == NounPhraseTypeDomain.value("NAM")) &&
+          m.phrase.last.posTag.intValue != PennPosDomain.posIndex
+      }.toSeq.sortBy(m => (m.phrase.start, m.phrase.length)).headOption
+      canonicalMention = canonicalOption.getOrElse(children.headOption.orNull)
+      canonicalName = canonicalMention.string
+    }
+    canonicalMention
+  }
+  var canonicalName: String = null
+  var canonicalMention: Mention = null
+  // If number, gender and entity type are needed, put a CategoricalVariable subclass in the Attr
+}
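The two files above define the bookkeeping API for within-document coreference: WithinDocCoref owns the span-to-Mention map and the entity table, while WithinDocEntity tracks the mentions assigned to one cluster. As a reading aid only (not part of this patch), the following minimal sketch shows how the pieces fit together; the Document and the two Phrase arguments are hypothetical placeholders assumed to come from an earlier tokenization and phrase-finding step.

import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.coref.{Mention, WithinDocCoref, WithinDocEntity}
import cc.factorie.app.nlp.phrase.Phrase

object WithinDocCorefSketch {
  // Hypothetical example: `doc`, `namePhrase` and `pronounPhrase` are placeholders, not part of this patch.
  def link(doc: Document, namePhrase: Phrase, pronounPhrase: Phrase): WithinDocCoref = {
    val coref = new WithinDocCoref(doc)               // one coreference solution for this Document
    val m1: Mention = coref.addMention(namePhrase)    // Mention with no entity assigned yet
    val m2: Mention = coref.addMention(pronounPhrase)
    val entity: WithinDocEntity = coref.newEntity()   // uniqueId derived from the running entity count
    entity += m1                                      // += also sets mention.entity
    entity += m2
    coref.trimEmptyEntities()                         // drop entities left with no mentions
    coref                                             // coref.mentions and coref.entities now reflect the clustering
  }
}

Because WithinDocCoref extends EvaluatableClustering, the clusterIds, pointIds and clusterId members defined above are what feed a predicted solution (and its gold-standard target) into CoNLL-style scoring, with singletons filtered out.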
diff --git a/src/main/scala/cc/factorie/app/nlp/embedding/Browse.scala b/src/main/scala/cc/factorie/app/nlp/embedding/Browse.scala
deleted file mode 100644
index f2cba89..0000000
--- a/src/main/scala/cc/factorie/app/nlp/embedding/Browse.scala
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
-   This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
-   http://factorie.cs.umass.edu, http://github.com/factorie
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
*/ -/* package cc.factorie.app.nlp.embedding -import cc.factorie.la._ -import cc.factorie.maths - -import scala.collection.mutable.LinkedHashMap -*/ -/* -object Browse { - - // t1 is the anchor (with some zero dimensions ignored), t2 is the data (which may have fewer zeros) - var zeroThreshold = 0.01 - def asymmetricDotSimilarity(t1:DenseTensor1, t2:DenseTensor1): Double = { - val f1 = t1.map(x => if (math.abs(x) < zeroThreshold) 0.0 else x) - //val f2 = t2.map(x => if (math.abs(x) < zeroThreshold) 0.0 else x) - //new DenseTensor1(f1) cosineSimilarity new DenseTensor1(f2) - new DenseTensor1(f1) cosineSimilarity new DenseTensor1(t2) - } - def asymmetricDotSimilarity2(t1:DenseTensor1, t2:DenseTensor1): Double = { - val f1 = t1.map(x => if (math.abs(x) < zeroThreshold) 0.0 else x) - val f2 = t2.map(x => if (math.abs(x) < zeroThreshold) 0.0 else x) - new DenseTensor1(f1) cosineSimilarity new DenseTensor1(f2) - } - - def main(args:Array[String]): Unit = { - val embeddings = new LinkedHashMap[String,DenseTensor1] - println("Reading embeddings...") - var dim = -1 - var lineNum = 0 - for (line <- io.Source.fromFile(args(0)).getLines()) { - val elts = line.split("\\s+") - val t = new DenseTensor1(elts.drop(1).map(_.toDouble)) - // t.twoNormalize() // TODO Make this a command-line option - embeddings(elts.head) = t - if (dim > 0) { if (t.length != dim) println(s"At line $lineNum expected length $dim but got ${t.length}") } - else dim = t.length - lineNum += 1 - } - println(s"...Read $dim-dimensional embeddings for ${embeddings.size} words.") - - val prompt = "> " - print(prompt); System.out.flush() - val cosSimilarity: (DenseTensor1,DenseTensor1)=>Double = (t1,t2) => t1.cosineSimilarity(t2) // cosine similarity - val dotSimilarity: (DenseTensor1,DenseTensor1)=>Double = (t1,t2) => t1.dot(t2) // dot product - val sigSimilarity: (DenseTensor1,DenseTensor1)=>Double = (t1,t2) => maths.sigmoid(t1.dot(t2)) // sigmoid of dot product - val maskedDotSimilarity: (DenseTensor1,DenseTensor1)=>Double = (t1,t2) => asymmetricDotSimilarity(t1, t2) // - val maskedDotSimilarity2: (DenseTensor1,DenseTensor1)=>Double = (t1,t2) => asymmetricDotSimilarity2(t1, t2) // - val euclideanSimilarity: (DenseTensor1,DenseTensor1)=>Double = (t1,t2) => 1.0 / t1.euclideanDistance(t2) // inverse of Euclidean distance - var count = 10 - for (line <- io.Source.stdin.getLines()) { - //val query = embeddings.getOrElse(line.stripLineEnd, null) - //val query = line.split("\\s+").map(word => embeddings.getOrElse(word, null)).filter(_ eq null).foldLeft(new DenseTensor1(dim))((a,b) => {b += a; b}) - val query = new DenseTensor1(dim) - val queryWords = line.split("\\s+") - var similarity: (DenseTensor1,DenseTensor1)=>Double = (t1,t2) => { - if (t1.length != t2.length) println(s"embedding.Browse t1=${t1.length} t2=${t2.length}") - t1.cosineSimilarity(t2) - } - var operation = 1 // 1 for addition, -1 for subtraction - for (word <- queryWords) { - if (word.matches("\\d+")) count = word.toInt - else if (word == "-") operation = -1 - else if (word == "+") operation = 1 - else if (word == "cos:") similarity = cosSimilarity - else if (word == "dot:") similarity = dotSimilarity - else if (word == "sig:") similarity = sigSimilarity - else if (word == "asy:") similarity = asymmetricDotSimilarity - else if (word == "asy2:") similarity = asymmetricDotSimilarity2 - else if (word == "euc:") similarity = euclideanSimilarity - else if (word.matches("thresh=[\\d\\.]+")) zeroThreshold = word.split("=")(1).toDouble - else if (word == "zero:") query.zero() - else 
if (word.startsWith("[")) { // Expecting [2] for one-hot at dimension 2 - val oneHot = new DenseTensor1(dim) - val i = word.drop(1).dropRight(1).toInt - oneHot(i) = 1.0 - if (operation == 1) query += oneHot else query -= oneHot - } else { - val embedding = embeddings.getOrElse(word, null) - if (embedding eq null) println(s"'$word' is outside vocabulary.") - else { - if (operation == 1) query += embedding else query -= embedding - } - } - } - if (query.oneNorm != 0.0) { - println("QUERY: "+line) - val top = new cc.factorie.util.TopN[String](count) - for (tuple <- embeddings) top += (0, similarity(query, tuple._2), tuple._1) - for (entry <- top) - println(f"${entry.category}%-25s ${entry.score}%3.8f "+ - f"2norm=${embeddings(entry.category).twoNorm}%f3.5 "+ - f"min=${embeddings(entry.category).min}%f3.3 "+ - f"max=${embeddings(entry.category).max}%f3.3 "+ - f"absmin=${embeddings(entry.category).map(math.abs(_)).min}%f3.3 "+ - f"<${zeroThreshold}%1.1gcount=${embeddings(entry.category).filter(x => math.abs(x) < zeroThreshold).size}%d " - ) - println() - } - print(prompt); System.out.flush() - } - } -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/embedding/CBOW.scala b/src/main/scala/cc/factorie/app/nlp/embedding/CBOW.scala deleted file mode 100644 index 8afa51e..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embedding/CBOW.scala +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -/* -package cc.factorie.app.nlp.embedding -import java.io._ - -import cc.factorie.la._ -import cc.factorie.model._ -import cc.factorie.util.{DoubleAccumulator, IntArrayBuffer} - -import scala.collection.mutable.ArrayBuffer - -class CBOWOptions extends WindowWordEmbedderOptions with IncrementalVocabularyOptions { - val margin = new CmdOption("margin", 0.1, "DOUBLE", "Margin for WSABIE training.") - val loss = new CmdOption("loss", "wsabie", "STRING", "Loss function; options are wsabie and log.") - val browse = new CmdOption("browse", false, "true|false", "If true provide prompt for interatively browsing input embeddings.") -} - -object CBOWExample { - def apply(model:CBOW, wordIndices:Array[Int], centerPosition:Int, window:Int): Option[CBOWExample] = { - val targetId = wordIndices(centerPosition) - if (model.discard(targetId)) { // Skip some of the most common target words - //println("CBOWExample skipping "+model.domain.category(targetId)) - return None - } - val context = new IntArrayBuffer(window*2) - var i = math.max(centerPosition - window, 0) - val end = math.min(centerPosition + window, wordIndices.length) - while (i < end) { - val wi = wordIndices(i) - // Next line sometimes discards frequent words from context - if (i != centerPosition && !model.discard(wi)) context += wi - i += 1 - } - if (context.length < model.opts.minContext.value) return None - val result = model.opts.loss.value match { - case "log" => new LogCBOWExample(model, targetId, context.asArray) - case "wsabie" => new WsabieCBOWExample(model, targetId, context.asArray) - case unk => throw new Error("Unknown loss "+unk) - } - if (false && model.random.nextDouble() < 0.005) { - val start = math.max(centerPosition - window, 0) - println(s"CBOWExample raw ${Range(start, end).map(i => model.domain.category(wordIndices(i))).mkString(" ")}") - println(s"CBOWExample ${model.domain.category(targetId)} ${Range(0, context.length).map(i => model.domain.category(context(i))).mkString(" ")}") - } - Some(result) - } -} - -trait CBOWExample extends WindowWordEmbedderExample { - def targetId: Int -} - -class LogCBOWExample(val model:CBOW, val targetId:Int, val inputIndices:Array[Int]) extends CBOWExample { - val changedWeights = new ArrayBuffer[Weights] - def outputIndices: Array[Int] = Array(targetId) - val samples = model.makeNegativeSamples // Do this once up front so that Example.testGradient will work - def accumulateValueAndGradient(value: DoubleAccumulator, gradient: WeightsMapAccumulator): Unit = { - var targetEmbedding = model.outputEmbedding(targetId) - val contextEmbedding = new DenseTensor1(model.dims) - val len = inputIndices.length - var i = 0; while (i < len) { contextEmbedding += model.inputEmbedding(inputIndices(i)); i += 1 } - if (model.opts.normalizeX.value) - contextEmbedding *= (1.0 / len) - //for (i <- start until start+length) contextEmbedding += model.embedding(context(i)) - // Positive case - var score = targetEmbedding dot contextEmbedding - var expScore = math.exp(-score) - // FIXME this log1p is actually really slow and we don't use it for anything! 
-// if (value ne null) value.accumulate(-math.log1p(expScore)) - if (gradient ne null) { - val stepSize = expScore/(1.0 + expScore) - gradient.accumulate(model.outputWeights(targetId), contextEmbedding, stepSize) - changedWeights += model.outputWeights(targetId) - i = 0; while (i < len) { gradient.accumulate(model.inputWeights(inputIndices(i)), targetEmbedding, stepSize); changedWeights += model.inputWeights(inputIndices(i)); i += 1 } - } - // Negative case - for (n <- 0 until model.opts.negative.value) { - val falseTarget = samples(n) - targetEmbedding = model.outputEmbedding(falseTarget) - score = targetEmbedding dot contextEmbedding - expScore = math.exp(-score) - // FIXME this log1p is actually really slow and we don't use it for anything! -// if (value ne null) value.accumulate(-score - math.log1p(expScore)) - if (gradient ne null) { - val stepSize = -1.0 / (1.0 + expScore) - gradient.accumulate(model.outputWeights(falseTarget), contextEmbedding, stepSize) - changedWeights += model.outputWeights(falseTarget) - i = 0; while (i < len) { gradient.accumulate(model.inputWeights(inputIndices(i)), targetEmbedding, stepSize); i += 1 } - } - } - } -} - -class WsabieCBOWExample(val model:CBOW, val targetId:Int, val inputIndices:Array[Int]) extends CBOWExample { - val changedWeights = new ArrayBuffer[Weights] - def outputIndices: Array[Int] = Array(targetId) - val samples = model.makeNegativeSamples // Do this once up front so that Example.testGradient will work - def accumulateValueAndGradient(value: DoubleAccumulator, gradient: WeightsMapAccumulator): Unit = { - val contextEmbedding = new DenseTensor1(model.dims) - val len = inputIndices.length - var i = 0; while (i < len) { contextEmbedding += model.inputEmbedding(inputIndices(i)); i += 1 } - // TODO FIX this is weird since normalizeX should average the contexts, not project things - // Also this should only project onto ball, not surface of sphere since nonconvex -luke - val inputNormalizer = if (model.opts.normalizeX.value) 1.0 / math.sqrt(len) else 1.0 // TODO Should we have this normalization? In my quick eyeing of results, it looks worse with normalization than without. 
- if (inputNormalizer != 1.0) contextEmbedding *= inputNormalizer // Normalize the input embedding - // Positive case - val trueTargetEmbedding = model.outputEmbedding(targetId) - val trueScore = trueTargetEmbedding dot contextEmbedding - // Negative cases - for (s <- samples) { - val falseTargetId = s - val falseTargetEmbedding = model.outputEmbedding(falseTargetId) - val falseScore = falseTargetEmbedding dot contextEmbedding - val objective = trueScore - falseScore - model.opts.margin.value - if (objective < 0.0) { - if (value ne null) value.accumulate(objective) - if (gradient ne null) { - gradient.accumulate(model.outputWeights(targetId), contextEmbedding, inputNormalizer) //; touchedWeights += model.outputWeights(targetId) - gradient.accumulate(model.outputWeights(falseTargetId), contextEmbedding, -inputNormalizer) //; touchedWeights += model.outputWeights(falseTargetId) - val trueFalseEmbeddingDiff = trueTargetEmbedding - falseTargetEmbedding - i = 0; while (i < len) { - gradient.accumulate(model.inputWeights(inputIndices(i)), trueFalseEmbeddingDiff, inputNormalizer); changedWeights += model.inputWeights(inputIndices(i)) - i += 1 - } - } - } - } - } -} - -class CBOW(override val opts:CBOWOptions) extends WordEmbedder(opts) { - def newExample(model:WordEmbedder, wordIndices:Array[Int], centerPosition:Int, window:Int): Option[CBOWExample] = CBOWExample(model.asInstanceOf[CBOW], wordIndices, centerPosition, window) -} -*/ - - -/** A command-line interface to CBOW word embedding training and browsing. - - Examples: - - CBOW --vocabulary vocabulary.txt --vocab-min-count 200 --vocab-input enwiki-latest-pages-articles.xml.bz2 - Do not train embeddings, but build a vocabulary of all words occurring 200 times or more in all of Wikipedia. - Store the results (vocabulary, and the number of occurrences of each word type) in the file "vocabulary.txt" - - CBOW --dims 100 --vocabulary vocabulary.txt --train-input enwiki-latest-pages-articles.xml.bz2 --max-documents 1000000 --parameters-output parameters.gz - Train on the first first 1 million articles of Wikipedia, using embedding vectors of size 100. - Use the vocabulary and counts from the file "vocabulary.txt"; skip all words not in this vocabulary. - The same embedding parameters will be used for both context (input) and target (output). - During training, every 10 million word training windows, write the input embeddings (in a simple textual format) to filenames such as "embeddings-0010m". - After training save input and output embeddings (in a compressed binary format) to the filename "parameters.gz". - - CBOW --dims 100 --vocabulary vocabulary.txt --parameters-load parameters.gz --browse - Load previously trained parameters from the file "parameters.gz". - (It is your responsibility to specify the corresponding --vocabulary and --dims values that were used when these parameters were trained.) - Then provide an interact prompt for browsing embeddings. - At the prompt you can enter a word, and the list of words with nearest embeddings will be printed to the screen. - - CBOW --dims 100 --incremental-vocab-max-size 500000 --incremental-vocab-min-count 50 --vocabulary vocabulary.txt --train-input enwiki-latest-pages-articles.xml.bz2 --max-documents 10000000 - Train on the first 1 million artcles of Wikipedia using embedding vectors of size 100, but do not build the vocabulary in advance. 
- Instead incrementally build the vocabulary during training: - count new words; as soon as their count is >= 50, add them to the vocabulary give them an embedding and start learning the embedding on future words. - After training, write the vocabulary to the file "vocabulary.txt". - - */ -/* -object CBOW { - - def main(args:Array[String]): Unit = { - val opts = new CBOWOptions - opts.parse(args) - val cbow = if (opts.incrementalVocabMaxSize.wasInvoked) new CBOW(opts) with IncrementalVocabulary else new CBOW(opts) - if (opts.trainInput.wasInvoked) { - cbow.train(opts.trainInput.value.map(new File(_))) - cbow.writeInputEmbeddings("embeddings.txt") - if (opts.incrementalVocabMaxSize.wasInvoked) cbow.writeVocabulary(opts.vocabulary.value) - } else if (opts.vocabInput.wasInvoked) { - cbow.buildVocabulary(opts.vocabInput.value) - } else if (opts.browse.wasInvoked) { - cbow.browse() - } else { - println("Either option --train-input or --vocab-input or --browse is required.") - System.exit(-1) - } - println("CBOW.main done.") - } - -} - -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/embedding/SkipGram.scala b/src/main/scala/cc/factorie/app/nlp/embedding/SkipGram.scala deleted file mode 100644 index bdcf496..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embedding/SkipGram.scala +++ /dev/null @@ -1,183 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* -package cc.factorie.app.nlp.embedding - -import java.io.File - -import cc.factorie.la.{DenseTensor1, WeightsMapAccumulator} -import cc.factorie.model.Weights -import cc.factorie.util.{DoubleAccumulator, IntArrayBuffer} - -import scala.collection.mutable.ArrayBuffer -*/ -/** - * Created by asubbaswamy on 8/17/15. 
- */ -/* -class SkipGramOptions extends WindowWordEmbedderOptions with IncrementalVocabularyOptions { - val margin = new CmdOption("margin", 0.1, "DOUBLE", "Margin for WSABIE training.") - val loss = new CmdOption("loss", "wsabie", "STRING", "Loss function; options are wsabie and log.") - val browse = new CmdOption("browse", false, "true|false", "If true provide prompt for interatively browsing input embeddings.") -} - -object SkipGramExample { - def apply(model:SkipGram, wordIndices:Array[Int], centerPosition:Int, window:Int): Option[SkipGramExample] = { - val targetId = wordIndices(centerPosition) - if (model.discard(targetId)) { // Skip some of the most common target words - //println("SkipGramExample skipping "+model.domain.category(targetId)) - return None - } - val context = new IntArrayBuffer(window*2) - var i = math.max(centerPosition - window, 0) - val end = math.min(centerPosition + window, wordIndices.length) - while (i < end) { - val wi = wordIndices(i) - // Next line sometimes discards frequent words from context - if (i != centerPosition && !model.discard(wi)) context += wi - i += 1 - } - if (context.length < model.opts.minContext.value) return None - val result = model.opts.loss.value match { - case "log" => new LogSkipGramExample(model, targetId, context.asArray) - case "wsabie" => new WsabieSkipGramExample(model, targetId, context.asArray) - case unk => throw new Error("Unknown loss "+unk) - } - if (false && model.random.nextDouble() < 0.005) { - val start = math.max(centerPosition - window, 0) - println(s"SkipGramExample raw ${Range(start, end).map(i => model.domain.category(wordIndices(i))).mkString(" ")}") - println(s"SkipGramExample ${model.domain.category(targetId)} ${Range(0, context.length).map(i => model.domain.category(context(i))).mkString(" ")}") - } - Some(result) - } -} - -trait SkipGramExample extends WindowWordEmbedderExample { - def targetId: Int -} - -class LogSkipGramExample(val model:SkipGram, val targetId:Int, val outputIndices:Array[Int]) extends SkipGramExample { - val changedWeights = new ArrayBuffer[Weights] - def inputIndices: Array[Int] = Array(targetId) - val samples = model.makeNegativeSamples // Do this once up front so that Example.testGradient will work - - def accumulateValueAndGradient(value: DoubleAccumulator, gradient: WeightsMapAccumulator): Unit = { - var i = 0 - val len = outputIndices.length - while (i < len) { //for every word in the context... 
- val index = outputIndices(i) - var targetEmbedding = model.inputEmbedding(targetId) //in skip-gram the "target" is the input - val contextEmbedding = new DenseTensor1(model.dims) - contextEmbedding += model.outputEmbedding(index) - - if (model.opts.normalizeX.value) contextEmbedding *= (1.0 / len) - - var score = targetEmbedding dot contextEmbedding - var expScore = math.exp(-score) - - if (gradient ne null) { - //positive example - var stepSize = expScore/(1.0 + expScore) - gradient.accumulate(model.inputWeights(targetId), contextEmbedding, stepSize) - gradient.accumulate(model.outputWeights(index), targetEmbedding, stepSize) - - //negative examples - for (n <- 0 until model.opts.negative.value) { - val falseTarget = samples(n) - targetEmbedding = model.inputEmbedding(falseTarget) - score = targetEmbedding dot contextEmbedding - expScore = math.exp(-score) - stepSize = -1.0 / (1.0 + expScore) - gradient.accumulate(model.inputWeights(falseTarget), contextEmbedding, stepSize) - gradient.accumulate(model.outputWeights(index), targetEmbedding, stepSize) - } - } - i += 1 - } - } -} - -class WsabieSkipGramExample(val model:SkipGram, val targetId:Int, val outputIndices:Array[Int]) extends SkipGramExample { - val changedWeights = new ArrayBuffer[Weights] - def inputIndices: Array[Int] = Array(targetId) - val samples = model.makeNegativeSamples // Do this once up front so that Example.testGradient will work - - def accumulateValueAndGradient(value: DoubleAccumulator, gradient: WeightsMapAccumulator): Unit = { - var i = 0 - val len = outputIndices.length - - while (i < len) { //for each context index - val index = outputIndices(i) - val trueTargetEmbedding = model.inputEmbedding(targetId) - val contextEmbedding = new DenseTensor1(model.dims) - contextEmbedding += model.outputEmbedding(index) - - val inputNormalizer = if (model.opts.normalizeX.value) 1.0 / math.sqrt(len) else 1.0 - if (inputNormalizer != 1.0) contextEmbedding *= inputNormalizer // Normalize the input embedding - - val trueScore = trueTargetEmbedding dot contextEmbedding - - for (s <- samples) { - val falseTargetId = s - val falseTargetEmbedding = model.inputEmbedding(falseTargetId) - val falseScore = falseTargetEmbedding dot contextEmbedding - - val objective = trueScore - falseScore - model.opts.margin.value - - if (objective < 0.0) { - if (value ne null) value.accumulate(objective) - if (gradient ne null) { - gradient.accumulate(model.inputWeights(targetId), contextEmbedding, inputNormalizer) - gradient.accumulate(model.inputWeights(falseTargetId), contextEmbedding, -inputNormalizer) - - val trueFalseEmbeddingDiff = trueTargetEmbedding - falseTargetEmbedding - gradient.accumulate(model.outputWeights(index), trueFalseEmbeddingDiff, inputNormalizer) - } - } - } - - i += 1 - } - } -} -*/ -/* -class SkipGram(override val opts:SkipGramOptions) extends WordEmbedder(opts) { - def newExample(model:WordEmbedder, wordIndices:Array[Int], centerPosition:Int, window:Int): Option[SkipGramExample] = SkipGramExample(model.asInstanceOf[SkipGram], wordIndices, centerPosition, window) -} - -object SkipGram { - def main(args: Array[String]) { - val opts = new SkipGramOptions - opts.parse(args) - val skipgram = if (opts.incrementalVocabMaxSize.wasInvoked) new SkipGram(opts) with IncrementalVocabulary else new SkipGram(opts) - if (opts.trainInput.wasInvoked) { - skipgram.train(opts.trainInput.value.map(new File(_))) - if (opts.separateIO.value) { - skipgram.writeInputEmbeddings("input_embeddings.txt") - 
skipgram.writeOutputEmbeddings("output_embeddings.txt") - } - else skipgram.writeInputEmbeddings("embeddings.txt") - if (opts.incrementalVocabMaxSize.wasInvoked) skipgram.writeVocabulary(opts.vocabulary.value) - } else if (opts.vocabInput.wasInvoked) { - skipgram.buildVocabulary(opts.vocabInput.value) - } else if (opts.browse.wasInvoked) { - skipgram.browse() - } else { - println("Either option --train-input or --vocab-input or --browse is required.") - System.exit(-1) - } - println("SkipGram.main done.") - } -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/embedding/WindowWordEmbedder.scala b/src/main/scala/cc/factorie/app/nlp/embedding/WindowWordEmbedder.scala deleted file mode 100644 index 073f86f..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embedding/WindowWordEmbedder.scala +++ /dev/null @@ -1,536 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* -package cc.factorie.app.nlp.embedding -import java.io._ -import java.text.NumberFormat -import java.util.Locale -import java.util.zip.{GZIPInputStream, GZIPOutputStream} - -import cc.factorie.app.strings.alphaSegmenter -import cc.factorie.la._ -import cc.factorie.model._ -import cc.factorie.optimize._ -import cc.factorie.variable.CategoricalDomain -import org.apache.commons.compress.compressors.CompressorStreamFactory - -import scala.collection.mutable.ArrayBuffer -import scala.io.Source -import scala.util.Random -import scala.xml.pull._ - -class WindowWordEmbedderOptions extends cc.factorie.util.DefaultCmdOptions { - val vocabInput = new CmdOption("vocab-input", List("enwiki-latest-pages-articles.xml.bz2"), "TXTFILE", "Text files from which to read documents and words for building the vocabulary. Works with *.txt.gz, Wikipedia enwiki*.xmlgz2, and a few other formats.") - val trainInput = new CmdOption("train-input", List("enwiki-latest-pages-articles.xml.bz2"), "TXTFILE", "Text files from which to read documents and words for training the embeddings. 
Works with *.txt.gz, Wikipedia enwiki*.xmlgz2, and a few other formats.") - val parametersSave = new CmdOption("parameters-save", "parameters.gz", "FILE", "If invoked, save the parameters after training to this filename in compressed binary format.") - val parametersLoad = new CmdOption("parameters-load", "parameters.gz", "FILE", "If invoked, load the parameters at initialization time from this filename containing compressed binary embedding parameters.") - val dims = new CmdOption("dims", 50, "INT", "Dimensionality of the embedding vectors.") - val seed = new CmdOption("seed", 0, "INT", "Seed for random number generator.") - val minContext = new CmdOption("min-context", 2, "INT", "Skip training windows that end up with fewer context words after randomized removal of common context words.") - val window = new CmdOption("window", 5, "INT", "The number of words on either side of the target to include in the context window.") - val normalizeX = new CmdOption("normalize-x", false, "BOOL", "If true, normalize input context by the number of context words in the training example.") - val vocabulary = new CmdOption("vocabulary", "vocabulary.txt", "FILE", "Filename from which to initialize or save the collection of all word types, one per line, each preceded by its count.") - val negative = new CmdOption("negative", 3, "INT", "The number of NCE negative examples to use for each positive example.") - val maxDocuments = new CmdOption("max-documents", Long.MaxValue, "LONG", "Read no more than this number of documents or Wikipedia pages. Default is Int.MaxValue.") - val separateIO = new CmdOption("separate-io", false, "BOOLEAN", "If TRUE, parameterize input embeddings (U) separately from output embeddings (V). Default is FALSE.") - val checkGradient = new CmdOption("check-gradient", false, "BOOLEAN", "If TRUE, test the value/gradient calculation for every parameter for every example after the first 50000 example. (Slow.) Default is FALSE.") - val outputExamples = new CmdOption("output-examples", "examples.txt.gz", "FILE", "Save the training targets/contexts in this file, one per line.") - val useAliasSampling = new CmdOption("alias-sampling", false, "BOOLEAN", "Sample negative examples using alias sampling vs. power-law approximation.") - val powerForCounts = new CmdOption("power-for-counts", 0.75, "DOUBLE", "Power to raise counts to when computing proportions for exact alias sampling.") - - val vocabMinCount = new CmdOption("vocab-min-count", 200, "INT", "Words with count smaller than this will be discarded when building the vocabulary. Default is 200.") - val vocabSkipProb = new CmdOption("vocab-skip-prob", 0.0, "DOUBLE", "The probabilty that each word string will be skipped in the indexing and counting when building the vocabulary. Helps efficiently cull words occurring ~1 times.") -} - -trait WindowWordEmbedderExample extends Example { - def inputIndices: Array[Int] - def outputIndices: Array[Int] - def changedWeights: ArrayBuffer[Weights] -} - -abstract class WordEmbedder(val opts:WindowWordEmbedderOptions) extends Parameters { - val dims = opts.dims.value - val random = new Random(opts.seed.value) - val domain = new CategoricalDomain[String]; domain.gatherCounts = true - - val maxDomainSize = initDomain() - // Initialize both input and output embedding vectors - private val _inputEmbedding = Array.fill(maxDomainSize)(Weights(new DenseTensor1(dims).fill(() => random.nextDouble()/dims/10 - 0.5/dims/10))) // TODO How should vectors be initialized? /10 good? 
- private val _outputEmbedding = if (opts.separateIO.value) Array.fill(maxDomainSize)(Weights(new DenseTensor1(dims).fill(() => random.nextDouble()/dims/10 - 0.5/dims/10))) else _inputEmbedding // TODO How should vectors be initialized? /10 good? - private val _discardProb = new Array[Double](maxDomainSize) - - def inputEmbedding(i:Int) = _inputEmbedding(i).value - def inputWeights(i:Int) = _inputEmbedding(i) - def inputEmbedding(word:String) = { val index = domain.index(word); if (index >= 0) _inputEmbedding(index).value else null } - def outputEmbedding(i:Int) = _outputEmbedding(i).value - def outputWeights(i:Int) = _outputEmbedding(i) - def outputEmbedding(word:String) = { val index = domain.index(word); if (index >= 0) _outputEmbedding(index).value else null } - // TODO Should this be here, or should we let subclasses handle this? -akm - if (opts.parametersLoad.wasInvoked) loadParameters(new File(opts.parametersLoad.value)) - - def discardProb(wordIndex: Int): Double = { - var result = _discardProb(wordIndex) - //println("discardProb "+result) - if (result != 0.0) return result - result = 1.0 - math.sqrt(0.0001/(domain.count(wordIndex).toDouble / domain.countsTotal.toDouble)) - _discardProb(wordIndex) = result - //println("discardProb again "+result) - result - } - def discard(wordIndex:Int): Boolean = random.nextDouble() < discardProb(wordIndex) - def discardProbReset(): Unit = { java.util.Arrays.fill(_discardProb, 0.0) } - - /** Initialize the vocabulary into this.domain, either by the incrementalVocabMaxSize or by reading the vocabulary from a file. */ - def initDomain(): Int = { - // Read in the vocabulary - for (splitLine <- Source.fromFile(opts.vocabulary.value).getLines().map(_.split(' '))) domain.indexWithCount(splitLine(1), splitLine(0).toInt) - println("Vocabulary size "+domain.size) - println("Vocabulary total count "+domain.countsTotal) - domain.freeze() - domain.size - } - - def stringToWordIndices(string:String): cc.factorie.util.IntArrayBuffer = { - val wordIndices = new cc.factorie.util.IntArrayBuffer(string.length/4) // guessing average word length > 4 - for (word <- stringToWords(string)) { - val wordIndex = domain.index(word) - if (wordIndex >= 0) wordIndices += wordIndex - } - wordIndices - } - def stringToExamples(string:String): Iterable[WindowWordEmbedderExample] = { - val wordIndices = stringToWordIndices(string) - if (wordIndices.length > opts.window.value) { // TODO was > 2*opts.window.value - val array = wordIndices._rawArray - val windowSize = opts.window.value - // this is slow - for (targetPosition <- 0 until wordIndices.length - windowSize; example <- newExample(this, array, targetPosition, 1 + random.nextInt(windowSize))) yield example - } else Nil - } - def newExample(model:WordEmbedder, wordIndices:Array[Int], centerPosition:Int, window:Int): Option[WindowWordEmbedderExample] - - def train(files:Seq[File]): Unit = { - //val strings = for (filename <- filenames; string <- fileToStringIterator(new File(filename))) yield string - val strings = files.iterator.flatMap(f => fileToStringIterator(f)) - train(strings) - } - - - /** Train the parameters of the embedding given an iterator over the string contents of "documents", - which could be large textual documents like Wikipedia pages or short documents like tweets or titles. - - Every snapshotIncrement training examples, write the embedding parameters to a file - named, for example, embeddings-0010m, for the 10 millionth training window. 
- If you don't want incremental saving of parameter snapshots, set snapshotIncrement to a negative number. - - Every logIncrement print to stdout how many training windows we have trained on so far. - If you don't want any logging, set logIncrement to a negative number. -*/ - def train(strings:Iterator[String], snapshotIncrement:Int = 10000000, logIncrement:Int = 1000000): Unit = { - if (opts.outputExamples.wasInvoked) println("Writing data examples to "+opts.outputExamples.value) - var wordCount = 0 - var nextWordCountLog = logIncrement - var nextWordCountShapshot = snapshotIncrement - val optimizer = new AdaGrad() - optimizer.initializeWeights(this.parameters) - val trainer = new OnlineTrainer(parameters, optimizer, logEveryN=50000) - for (string <- strings) { - val examples = stringToExamples(string) - //println("CBOW.train examples.size = "+examples.size) - wordCount += examples.size - if (opts.checkGradient.value && wordCount >= 10000) { - print(s"CBOW testGradient ${examples.map(e => e.outputIndices.map(domain.category).mkString(" ")).mkString(" ")} ...") - examples.foreach(e => Example.testGradient(parameters, parameters.keys, e, dx = 1e-7, verbose = true, returnOnFirstError = false)) // TODO Put back "assert" - println("finished.") - } - trainer.processExamples(examples) - if (logIncrement > 0 && wordCount > nextWordCountLog) { println(s" Trained on ${wordCount/1000000}m contexts."); nextWordCountLog += logIncrement } - if (snapshotIncrement > 0 && wordCount > nextWordCountShapshot && !opts.outputExamples.wasInvoked) { - val fn = f"embeddings-${wordCount/1000000}%04dm" - println("Writing intermediate embeddings to "+fn) - writeInputEmbeddings(fn) - nextWordCountShapshot += snapshotIncrement - } - } - if (opts.parametersSave.wasInvoked) saveParameters(new File(opts.parametersSave.value)) - } - - - /** Return a ranked list of domain index/word by their gated output vector's similarity the query vector. - The query vector is assumed to be already gated. The output vectors are gated on the fly here. */ - def neighbors(query:DenseTensor1, count:Int = 10): cc.factorie.util.TopN[String] = { - val top = new cc.factorie.util.TopN[String](count) - for (i <- 0 until domain.size) top += (i, query.cosineSimilarity(_outputEmbedding(i).value), domain.category(i)) - top - } - - /** Return a list of vocabulary indices to use a negative training examples, sampled according to a function of their counts. */ - def makeNegativeSamples: Array[Int] = { - if (opts.useAliasSampling.value) { - val len = opts.negative.value - val ret = new Array[Int](len) - var i = 0 - while (i < len) { - ret(i) = negativeSampler.sample() - i += 1 - } - ret - } else { - val len = opts.negative.value - val ret = new Array[Int](len) - var i = 0 - while (i < len) { - var r = random.nextDouble() - r = r * r * r // Rely on fact that domain is ordered by frequency, so we want to over-sample the earlier entries - ret(i) = (r * domain.size).toInt // TODO Make this better match a Ziph distribution! 
- i += 1 - } - ret - } - } - lazy val negativeSampler = new cc.factorie.util.Alias(domain.counts.asArray.map(_.toDouble).map(math.pow(_, opts.powerForCounts.value)))(random) - - - /** Interactive browsing of nearest neighbors */ - def browse(): Unit = { - var neighborCount = 10 - def printPrompt(): Unit = { print(s"\n$neighborCount> "); System.out.flush() } - printPrompt() - for (line <- Source.stdin.getLines()) { - val query = new DenseTensor1(dims) - var queryTerms = 0 - for (word <- line.split("\\s+")) word match { - case "\\d+" => neighborCount = word.toInt - case _ => { - val index = domain.index(word) - if (index < 0) println(s"'$word' is outside vocabulary") - else { query += inputEmbedding(index); queryTerms += 1 } - } - } - if (queryTerms > 0) { - val top = neighbors(query, neighborCount) - for (entry <- top) println(f"${entry.category}%-25s ${entry.score}%+08f") - } - printPrompt() - } - } - - var docCount = 0 - /** Read text to build up vocabulary and write the vocabulary to the filename specified by --vocabulary. */ - def buildVocabulary(filenames:Seq[String]): Unit = { - // Recursively gather all files listed on command line - val files = opts.vocabInput.value.flatMap(filename => recurseFiles(new File(filename))) - // Set up a String map to gather counts - val domain = new CategoricalDomain[String] - domain.gatherCounts = true - val printInterval = 100 - val random = new scala.util.Random(0) - val skipThreshold = 1.0 - opts.vocabSkipProb.value - var printAtCount = printInterval - var wordCount = 0 - //var docCount = 0 - // From all files, segment contents into words, and count them - files.foreach(file => { - println("Vocabulary reading "+file.getName()) - for (line <- fileToStringIterator(file)) { - if (docCount >= printAtCount) { - print("\r"+NumberFormat.getNumberInstance(Locale.US).format(docCount)+" articles, "+NumberFormat.getNumberInstance(Locale.US).format(wordCount)+" tokens, "+NumberFormat.getNumberInstance(Locale.US).format(domain.size)+" vocabulary") - //print(s"\r$wikipediaArticleCount articles\t$wordCount words") ; - printAtCount += printInterval - } - for (word <- stringToWords(line)) { - //println("Vocabulary read word "+word) - wordCount += 1 - if (skipThreshold == 0.0 || random.nextDouble() < skipThreshold) // Randomly sample to avoid words appearing only ~1 times. 
- domain.index(new String(word)) // to avoid memory leak: http://stackoverflow.com/questions/15612157/substring-method-in-string-class-causes-memory-leak - } - } - }) - println(s"Read $docCount documents") - println(s"Read ${domain.countsTotal} tokens, ${domain.size} types.") - // Get rid of words occurring less than 10 times - domain.trimBelowCount(opts.vocabMinCount.value, preserveCounts = true) - println(s"Trimed to ${domain.countsTotal} tokens, ${domain.size} types.") - // Serialize the results - println("Sorting...") - //the line below results in writing an empty vocabulary - //writeVocabulary(opts.vocabulary.value) - val sorted = domain.categories.map(c => (domain.count(c), c)).sortBy(-_._1) - val out = new PrintWriter(opts.vocabulary.value) - for ((count, word) <- sorted) - out.println("%d %s".format(count, word)) - out.close() - println("Done writing vocabulary.") - } - - def writeVocabulary(filename:String): Unit = { - val sorted = domain.categories.map(c => (domain.count(c), c)).sortBy(-_._1) - val out = new PrintWriter(filename) - for ((count, word) <- sorted) - out.println("%d %s".format(count, word)) - out.close() - } - - /** Given a document's string contents, return an iterator over the individual word tokens in the document */ - def stringToWords(string:String): Iterator[String] = { - //alphaSegmenter(string).filter(word => !Stopwords.contains(word.toLowerCase)) - alphaSegmenter(string) - } - - /** Given a file, return an iterator over the string contents of each "document". - Returns nil if the filename suffix is not handled. - */ - /* - def fileToStringIterator(file:File, encoding:String = "UTF8"): Iterator[String] = { - file.getName match { - // Plain text file - case name if name.endsWith(".txt") => Source.fromFile(file, encoding).getLines() - // Compressed text - case name if name.endsWith(".txt.gz") => { - val lineIterator = Source.fromInputStream(new GZIPInputStream(new FileInputStream(file)), encoding).getLines() - var lineCount = 0 - new Iterator[String] { - def hasNext: Boolean = lineIterator.hasNext && lineCount < opts.maxDocuments.value - def next(): String = { - lineCount += 1 - val sb = new StringBuffer - var emptyLine = false - while (lineIterator.hasNext && !emptyLine) { - val line = lineIterator.next() - if (line.length > 0) sb.append(line) - else emptyLine = true - } - sb.toString - } - } - } - case name if name.endsWith(".csv") => - val charset = java.nio.charset.Charset.forName(encoding) - val codec = new scala.io.Codec(charset) - codec.onMalformedInput(java.nio.charset.CodingErrorAction.IGNORE) - val lineIterator = Source.fromFile(file)(codec).getLines() - val cleaningRegex = "\"\\d+\"|<[^>]+>|&[a-z]{3,4};".r - new Iterator[String] { - var stringCount:Long = 0L - var lineCount:Long = 0L - def hasNext: Boolean = lineIterator.hasNext && stringCount < opts.maxDocuments.value //maxStringCount - def next(): String = { - val sb = new StringBuffer - var lineEnd = false - try { - while (lineIterator.hasNext && !lineEnd) { - val line = lineIterator.next() - lineCount += 1 - if (line.length > 0) sb.append(line) - if (!line.endsWith("\\")) lineEnd = true - } - } catch { - case e:Exception => System.err.println("Caught at line "+lineCount+" exception\n"+e) - } - stringCount += 1 - if (stringCount % 1000 == 0) print("\r"+stringCount) - cleaningRegex.replaceAllIn(sb.toString, " ") - } - } - case name if name.startsWith("enwiki") && name.endsWith(".xml.bz2") => - //var wikipediaArticleCount = 0 - val docIterator = 
cc.factorie.app.nlp.load.LoadWikipediaPlainText.fromCompressedFile(file, opts.maxDocuments.value) - new Iterator[String] { - def hasNext: Boolean = docIterator.hasNext - def next(): String = { docCount += 1; val doc = docIterator.next(); /*println(doc.name);*/ doc.string } - } - // bz2 compress wikipedia XML - case name if name.startsWith("deprecated enwiki") && name.endsWith(".xml.bz2") => { - var wikipediaArticleCount = 0 - val input = new CompressorStreamFactory().createCompressorInputStream(CompressorStreamFactory.BZIP2, new FileInputStream(file)) - val xml = new scala.xml.pull.XMLEventReader(Source.fromInputStream(input)) - //val xml = XMLInputFactory.newInstance().createXMLEventReader(new InputStreamReader(input)) - val cleaningRegex = List( - "\\{\\{[^\\}]*\\}\\}", // Remove everything {{inside}} - "\\{\\|[^\\}]*\\|\\}", // Remove everything {|inside|} - "(? println(e) - case EvElemStart(_, "page", _, _) => { insidePage = true } - case EvElemEnd(_, "page") => { insidePage = false; done = true } - case EvElemStart(_, "text", _, _) => { insideText = true } - case EvElemEnd(_, "text") => { insideText = false; done = true } - case EvText(t) if insideText => { - if (t.startsWith("!--") && !t.endsWith("--")) insideComment = true - else if (t.endsWith("--")) insideComment = false - else if (t.startsWith("ref") && !t.endsWith("/")) insideRef = true - else if (t == "/ref") insideRef = false - else if (!insideRef && !insideComment && !t.startsWith("ref ") && !t.startsWith("#REDIRECT")) { sb append t; sb append ' ' } - } - case _ => // ignore all other tags - } - } - var s = cleaningRegex.replaceAllIn(sb.toString, " ") - s = s.trim; if (s.length > 0) { wikipediaArticleCount += 1; return s } else { /*println(s"Skipping at $wikipediaArticleCount");*/ return next() } - s - } - } - } - case _ => throw new Error("Unknown suffix on document name "+file.getName()) - } - } -*/ - // Recursively descend directory, returning a list of files. 
- def recurseFiles(directory:File): Seq[File] = { - if (!directory.exists) throw new Error("File "+directory+" does not exist") - if (directory.isFile) return List(directory) - val result = new scala.collection.mutable.ArrayBuffer[File] - for (entry <- directory.listFiles) { - if (entry.isFile) result += entry - else if (entry.isDirectory) result ++= recurseFiles(entry) - } - result - } - - - def writeExamples(inputFilenames:Array[String]): Unit = { - val dataWriter = if (opts.outputExamples.wasInvoked) new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(opts.outputExamples.value)), "UTF-8") else null - var examplesWrittenCount = 0 - for (filename <- inputFilenames; string <- fileToStringIterator(new File(filename)); example <- stringToExamples(string)) { - examplesWrittenCount += 1 - dataWriter.write(s"$examplesWrittenCount\t${example.outputIndices.map(domain.category(_)).mkString(" ")}\t${example.inputIndices.map(i => domain.category(i)).mkString("\t")}\n") - } - dataWriter.close() - } - - // Writing embedding parameters to a file in textual format - def writeInputEmbeddings(filename:String): Unit = { - val out = new PrintWriter(filename) - for (i <- 0 until domain.size) - out.println("%s\t%s".format(domain.category(i), inputEmbedding(i).mkString(" "))) - out.close() - } - def writeOutputEmbeddings(filename:String): Unit = { - val os = new FileOutputStream(filename) - writeOutputEmbeddings(os) - os.close() - } - def writeOutputEmbeddings(out:OutputStream): Unit = { - val pw = new PrintWriter(out) - for (i <- 0 until domain.size) - pw.println("%s\t%s".format(domain.category(i), outputEmbedding(i).mkString(" "))) - pw.flush() - } - - // Save and load embedding parameters in a compressed binary format - def saveParameters(file:File): Unit = { - val out = new DataOutputStream(new GZIPOutputStream(new FileOutputStream(file))) - saveParameters(out) - out.close() - } - def saveParameters(out:DataOutputStream): Unit = { - out.writeInt(domain.size) - out.writeInt(dims) - out.writeBoolean(opts.separateIO.value) - val size = domain.size; var i, j = 0 - i = 0; while (i < size) { - val inputArray = inputEmbedding(i).asArray - val outputArray = outputEmbedding(i).asArray - j = 0; while (j < dims) { out.writeDouble(inputArray(j)); j += 1 } - if (opts.separateIO.value) { j = 0; while (j < dims) { out.writeDouble(outputArray(j)); j += 1 } } - i += 1 - } - } - def loadParameters(file:File): Unit = { - val in = new DataInputStream(new GZIPInputStream(new FileInputStream(file))) - loadParameters(in) - in.close() - } - def loadParameters(in:DataInputStream): Unit = { - val size = in.readInt(); assert(size == domain.size) - val writtenDims = in.readInt(); assert(writtenDims == dims) - val separateIO = in.readBoolean(); assert(opts.separateIO.value == separateIO) - var i, j = 0 - i = 0; while (i < size) { - val inputArray = inputEmbedding(i).asArray - val outputArray = outputEmbedding(i).asArray - j = 0; while (j < dims) { inputArray(j) = in.readDouble(); j += 1 } - if (separateIO) { j = 0; while (j < dims) { outputArray(j) = in.readDouble(); j += 1 } } - i += 1 - } - } - -} - - -trait IncrementalVocabularyOptions extends WindowWordEmbedderOptions { - val incrementalVocabMaxSize = new CmdOption("incremental-vocab-max-size", 100000, "INT", "When invoked this turns on incremental vocabulary building during training, allowing the vocabulary to grow up to this limit.") - val incrementalVocabMinCount = new CmdOption("incremental-vocab-min-count", 100, "INT", "When doing incremental vocabulary building, 
don't actually assign a word an embedding vector until it has been seen this many times.") -} - - -trait IncrementalVocabulary extends WordEmbedder { - lazy val incrementalVocabulary = new CategoricalDomain[String] - incrementalVocabulary.gatherCounts = true - - override def initDomain(): Int = { - domain.freeze() - opts.asInstanceOf[IncrementalVocabularyOptions].incrementalVocabMaxSize.value - } - - // Since the vocabulary and counts are growing, we can't just always use the same cached word counts, but must recalcuate them every once in a while - private var _lastDiscardProbResetCount = 0L - override def discardProb(wordIndex: Int): Double = { - if (domain.countsTotal < 1000000) return 0.0000001 - if (domain.countsTotal - _lastDiscardProbResetCount > 10000) { - discardProbReset() - _lastDiscardProbResetCount = domain.countsTotal - } - return super.discardProb(wordIndex) - } - - - override def stringToWordIndices(string:String): cc.factorie.util.IntArrayBuffer = { - val wordIndices = new cc.factorie.util.IntArrayBuffer(string.length/4) // guessing average word length > 4 - for (word <- stringToWords(string)) { - var wordIndex = domain.index(word) - if (wordIndex == -1) { - // The word doesn't yet have an embedding vector - val vi = incrementalVocabulary.index(word) // increment its count in the vocabulary of words that don't yet have an embedding - if (incrementalVocabulary.count(vi) > opts.asInstanceOf[IncrementalVocabularyOptions].incrementalVocabMinCount.value && domain.size < opts.asInstanceOf[IncrementalVocabularyOptions].incrementalVocabMaxSize.value) { - domain.unfreeze() - wordIndex = domain.index(word) // If the count is now above threshold and there is room to grow, then put then give the word an embedding - domain.freeze() - } - } - if (wordIndex >= 0) wordIndices += wordIndex - } - //println("IncrementalVocabulary "+wordIndices.toSeq) - wordIndices - } - -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/embeddings/CBOWEmbeddingModel.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/CBOWEmbeddingModel.scala deleted file mode 100644 index d304fa7..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/CBOWEmbeddingModel.scala +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.embeddings -import cc.factorie.la.{DenseTensor1, WeightsMapAccumulator} -import cc.factorie.optimize.Example -import cc.factorie.util.DoubleAccumulator - -import scala.collection.mutable - -class CBOWNegSamplingEmbeddingModel(override val opts: EmbeddingOpts) extends WordEmbeddingModel(opts) { - val negative = opts.negative.value - val window = opts.window.value - val rng = new util.Random(5) // fix the seed; - val sample = opts.sample.value.toDouble - override def process(doc: String): Int = { - // given a document, below line splits by space and converts each word to Int (by vocab.getId) and filters out words not in vocab - var sen = doc.stripLineEnd.split(' ').map(word => vocab.getId(word)).filter(id => id != -1) - val wordCount = sen.size - - // subsampling -> speed increase - if (sample > 0) - sen = sen.filter(id => subSample(id) != -1) - - val senLength = sen.size - for (senPosition <- 0 until senLength) { - val currWord = sen(senPosition) - val b = rng.nextInt(window) - val contexts = new mutable.ArrayBuffer[Int] - // make the contexts - for (a <- b until window * 2 + 1 - b) if (a != window) { - val c = senPosition - window + a - if (c >= 0 && c < senLength) - contexts += sen(c) - } - // make the examples - trainer.processExample(new CBOWNegSamplingExample(this, currWord, contexts, 1)) - (0 until negative).foreach(neg => trainer.processExample(new CBOWNegSamplingExample(this, currWord, List(vocab.getRandWordId), -1))) - } - return wordCount - } - // subsampling - def subSample(word: Int): Int = { - val prob = vocab.getSubSampleProb(word) // pre-computed to avoid sqrt call every time. Improvement of 10 secs on 100MB data ~ 15 MINs on 10GB - val alpha = rng.nextInt(0xFFFF) / 0xFFFF.toDouble - if (prob < alpha) { return -1 } - else return word - } -} -class CBOWNegSamplingExample(model: WordEmbeddingModel, word: Int, contexts: Seq[Int], label: Int) extends Example { - - // to understand the gradient and objective refer to : http://arxiv.org/pdf/1310.4546.pdf - def accumulateValueAndGradient(value: DoubleAccumulator, gradient: WeightsMapAccumulator): Unit = { - - val wordEmbedding = model.weights(word).value - val contextEmbedding = new DenseTensor1(model.D, 0) - contexts.foreach(context => contextEmbedding.+=(model.weights(context).value)) - - val score: Double = wordEmbedding.dot(contextEmbedding) - val exp: Double = math.exp(-score) // TODO : pre-compute , costly operation - - var objective: Double = 0.0 - var factor: Double = 0.0 - if (label == 1) { - objective = -math.log1p(exp) - factor = exp / (1 + exp) - } - if (label == -1) { - objective = -score - math.log1p(exp) - factor = -1 / (1 + exp) - } - if (value ne null) value.accumulate(objective) - if (gradient ne null) { - contexts.foreach(context => gradient.accumulate(model.weights(context), wordEmbedding, factor)) - gradient.accumulate(model.weights(word), contextEmbedding, factor) - } - - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/embeddings/Distance.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/Distance.scala deleted file mode 100644 index cdc87a9..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/Distance.scala +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* -package cc.factorie.app.nlp.embeddings -import cc.factorie.la.DenseTensor1 - -import scala.collection.mutable.PriorityQueue -import scala.io.Source -*/ -/* -object EmbeddingDistance { - - var threshold = 0 - var vocab = Array[String]() - var weights = Array[DenseTensor1]() - var D = 0 - var V = 0 - var top = 30 - def main(args: Array[String]) { - - if (args.size != 1) { - println("Input vectors file missing. USAGE : distance vectors.txt") - return - } - val inputFile = args(0) - load(inputFile) - play() - } - - def run(opts:EmbeddingOpts): Unit = { - load(opts.explore.value, opts.encoding.value) - play() - } - - def nearestNeighbours(inputFile: String, numNeighbours: Int = 30): Unit = { - top = numNeighbours - load(inputFile) - play() - } - def load(embeddingsFile: String, encoding:String = "UTF8"): Unit = { - var lineItr = Source.fromFile(embeddingsFile, encoding).getLines - // first line is (# words, dimension) - val details = lineItr.next.stripLineEnd.split(' ').map(_.toInt) - V = if (threshold > 0 && details(0) > threshold) threshold else details(0) - D = details(1) - println("# words : %d , # size : %d".format(V, D)) - vocab = new Array[String](V) - weights = new Array[DenseTensor1](V) - for (v <- 0 until V) { - val line = lineItr.next.stripLineEnd.split(' ') - vocab(v) = line(0).toLowerCase - weights(v) = new DenseTensor1(D, 0) // allocate the memory - for (d <- 0 until D) weights(v)(d) = line(d + 1).toDouble - weights(v) /= weights(v).twoNorm - } - println("loaded vocab and their embeddings") - } - def play(): Unit = { - - while (true) { - print("Enter word (EXIT to break) : ") - Source.stdin.getLines().flatMap(_.stripLineEnd.toLowerCase.split(' ').map(getID)).filter(_ != -1) - val words = Source.stdin.getLines().flatMap(_.stripLineEnd.toLowerCase.split(' ').map(getID)).filter(_ != -1).toArray - if (words.size == 0) { - println("words not in vocab") - } else { - val embedding_in = new DenseTensor1(D, 0) - words.foreach(word => embedding_in.+=(weights(word))) - embedding_in./=(words.size) - val pq = new PriorityQueue[(String, Double)]()(dis) - for (i <- 0 until vocab.size) if (words.size != 1 || !words(0).equals(vocab(i))) { - val embedding_out = weights(i) - val score = TensorUtils.cosineDistance(embedding_in, embedding_out) - if (i < top) pq.enqueue(vocab(i) -> score) - else if (score > pq.head._2) { // if the score is greater the min, then add to the heap - pq.dequeue - pq.enqueue(vocab(i) -> score) - } - } - var arr = new Array[(String, Double)](pq.size) - var i = 0 - while (!pq.isEmpty) { // min heap - arr(i) = (pq.head._1, pq.head._2) - i += 1 - pq.dequeue - } - println("\t\t\t\t\t\tWord\t\tCosine Distance") - arr.reverse.foreach(x => println("%50s\t\t%f".format(x._1, x._2))) - - } - } - } - // private helper functions - private def dis() = new Ordering[(String, Double)] { - def compare(a: (String, Double), b: (String, Double)) = -a._2.compare(b._2) - } - private def getID(word: String): Int = { - for (i <- 0 until vocab.length) if (vocab(i).equalsIgnoreCase(word)) - return i - return -1 - - } - -} -*/ \ No newline at end of file diff --git 
a/src/main/scala/cc/factorie/app/nlp/embeddings/EmbeddingOpts.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/EmbeddingOpts.scala deleted file mode 100644 index 94f58fd..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/EmbeddingOpts.scala +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.embeddings -import cc.factorie.util.CmdOptions - -class EmbeddingOpts extends CmdOptions { - - // Algorithm related - val dimension = new CmdOption("size", 200, "INT", "use size of word vectors") - val window = new CmdOption("window", 5, "INT", "use skip length between words") - val threads = new CmdOption("threads", 12, "INT", "use threads") - val negative = new CmdOption("negative", 1, "INT", "use number of negative examples") - val minCount = new CmdOption("min-count", 5, "INT", "This will discard words that appear less than times; default is 5") - val ignoreStopWords = new CmdOption("ignore-stopwords", false, "BOOLEAN", "use to include or discard stopwords. Use 1 for discarding stopwords") - val cbow = new CmdOption("cbow", false, "BOOLEAN", "user cbow=true for cbow and cbow=false for skip-gram") // 1 would be SkipGram // default method is skipgram - val sample = new CmdOption("sample", 0.001, "DOUBLE", "use subsampling") - val numIterations = new CmdOption("num-iterations", 1,"INT", "The number of iterations of training to run") - - // Optimization related (Don't change if you do not understand how vectors are initialized) - val rate = new CmdOption("rate", 0.025, "DOUBLE", "learning rate for adaGrad") - val delta = new CmdOption("delta", 0.1, "DOUBLE", "delta for adaGrad") - - // IO Related (MUST GIVE Options) - val encoding = new CmdOption("encoding", "UTF8", "STRING", "use for encoding option. ISO-8859-15 is default") - val saveVocabFile = new CmdOption("save-vocab", "", "STRING", "save vocab file") - val loadVocabFile = new CmdOption("load-vocab", "", "STRING", "load the vocab file") // atleast one of them should be given. save-vocab or load-vocab - val corpus = new CmdOption("train", "", "STRING", "train file") - val output = new CmdOption("output", "", "STRING", "Use to save the resulting word vectors") - val binary = new CmdOption("binary", false, "BOOLEAN", "use true for storing .gz format and false for plain txt format. Both stores in ISO-8859-15 Encoding") - - // Reading embeddings and finding close neighbors. (Not training embeddings.) - val explore = new CmdOption("explore", "embeddings.txt", "FILE", "Filename from which to read already-learned embeddings; the result of --output in a --train run.") - - // Vocabulary related - // Maximum 14.3M * 0.7 = 10M words in the vocabulary (Don;t change if you understand how vocabBuilder works) - val vocabSize = new CmdOption("max-vocab-size", 2e6.toInt, "INT", "Max Vocabulary Size. 
Default Value is 2M . Reduce to 200k or 500k is you learn embeddings on small-data-set") - val vocabHashSize = new CmdOption("vocab-hash-size", 14.3e6.toInt, "INT", "Vocabulary hash size") - val samplingTableSize = new CmdOption("sampling-table-size", 1e8.toInt, "INT", "Sampling Table size") - - -} diff --git a/src/main/scala/cc/factorie/app/nlp/embeddings/LiteHogWildTrainer.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/LiteHogWildTrainer.scala deleted file mode 100644 index 01b1ece..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/LiteHogWildTrainer.scala +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.embeddings -import cc.factorie.la.SmartGradientAccumulator -import cc.factorie.model.WeightsSet -import cc.factorie.optimize.{Example, GradientOptimizer, Trainer} -import cc.factorie.util.{LocalDoubleAccumulator, Threading} - -class LiteHogwildTrainer(val weightsSet: WeightsSet, val optimizer: GradientOptimizer, val nThreads: Int = Runtime.getRuntime.availableProcessors(), val maxIterations: Int = 3) - extends Trainer { - - var iteration = 0 - def processExample(e: Example): Unit = { - val gradientAccumulator = new SmartGradientAccumulator - val value = new LocalDoubleAccumulator() - e.accumulateValueAndGradient(value, gradientAccumulator) - optimizer.step(weightsSet, gradientAccumulator.getMap, value.value) - } - def processExamples(examples: Iterable[Example]): Unit = { - Threading.parForeach(examples.toSeq, nThreads)(processExample(_)) - } - def isConverged = iteration >= maxIterations -} diff --git a/src/main/scala/cc/factorie/app/nlp/embeddings/SkipGramEmbedding.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/SkipGramEmbedding.scala index 0e43888..b74a4f6 100644 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/SkipGramEmbedding.scala +++ b/src/main/scala/cc/factorie/app/nlp/embeddings/SkipGramEmbedding.scala @@ -18,6 +18,7 @@ import java.util.zip.GZIPInputStream import cc.factorie.la import cc.factorie.util.ClasspathURL +import scala.io object SkipGramEmbedding extends SkipGramEmbedding(s => ClasspathURL.fromDirectory[SkipGramEmbedding](s).openConnection().getInputStream, 100) diff --git a/src/main/scala/cc/factorie/app/nlp/embeddings/SkipGramEmbeddingModel.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/SkipGramEmbeddingModel.scala deleted file mode 100644 index f57238b..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/SkipGramEmbeddingModel.scala +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.embeddings -import cc.factorie.la.WeightsMapAccumulator -import cc.factorie.optimize.Example -import cc.factorie.util.DoubleAccumulator - -class SkipGramNegSamplingEmbeddingModel(override val opts: EmbeddingOpts) extends WordEmbeddingModel(opts) { - val negative = opts.negative.value - val window = opts.window.value - val rng = new util.Random(5) // set rng seed - val sample = opts.sample.value.toDouble - override def process(doc: String): Int = { - // given a document, below line splits by space and converts each word to Int (by vocab.getId) and filters out words not in vocab - var sen = doc.stripLineEnd.split(' ').map(word => vocab.getId(word)).filter(id => id != -1) - val wordCount = sen.size - - // subsampling -> speed increase - if (sample > 0) - sen = sen.filter(id => subSample(id) != -1) - - val senLength = sen.size - for (senPosition <- 0 until senLength) { - val currWord = sen(senPosition) - val b = rng.nextInt(window) - // make the contexts - val contexts = new collection.mutable.ArrayBuffer[Int] - for (a <- b until window * 2 + 1 - b) if (a != window) { - val c = senPosition - window + a - if (c >= 0 && c < senLength) - contexts += sen(c) - } - // make the examples - contexts.foreach(context => { - trainer.processExample(new SkipGramNegSamplingExample(this, currWord, context, 1)) - (0 until negative).foreach(neg => trainer.processExample(new SkipGramNegSamplingExample(this, currWord, vocab.getRandWordId, -1))) - }) - } - return wordCount - } - - def subSample(word: Int): Int = { - val prob = vocab.getSubSampleProb(word) // pre-computed to avoid sqrt call every time. 
- val alpha = rng.nextInt(0xFFFF) / 0xFFFF.toDouble - if (prob < alpha) { return -1 } - else return word - } -} -class SkipGramNegSamplingExample(model:WordEmbeddingModel, word: Int, context: Int, label: Int) extends Example { - - // to understand the gradient and objective refer to : http://arxiv.org/pdf/1310.4546.pdf - def accumulateValueAndGradient(value: DoubleAccumulator, gradient: WeightsMapAccumulator): Unit = { - val wordEmbedding = model.weights(word).value // access the word's embedding - val contextEmbedding = model.weights(context).value // access the context's embedding - val score: Double = wordEmbedding.dot(contextEmbedding) - val exp: Double = math.exp(-score) // TODO : pre-compute exp table - var objective: Double = 0.0 - var factor: Double = 0.0 - if (label == 1) { - objective = -math.log1p(exp) - factor = exp / (1 + exp) - } - if (label == -1) { - objective = -score - math.log1p(exp) - factor = -1 / (1 + exp) - } - if (value ne null) value.accumulate(objective) - if (gradient ne null) { - gradient.accumulate(model.weights(word), contextEmbedding, factor) - gradient.accumulate(model.weights(context), wordEmbedding, factor) - } - - } -} - diff --git a/src/main/scala/cc/factorie/app/nlp/embeddings/VocabBuilder.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/VocabBuilder.scala deleted file mode 100644 index 4e39f93..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/VocabBuilder.scala +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.embeddings -import java.io.{BufferedOutputStream, FileInputStream, FileOutputStream, OutputStreamWriter} -import java.util.zip.{GZIPInputStream, GZIPOutputStream} - -import cc.factorie.app._ - -import scala.util.Random - - -class VocabBuilder(vocab_hash_size: Int = 20e6.toInt, sampling_table_size: Int = 1e8.toInt, load_factor: Double = 0.7) { - - // Highly Optimized hash table using linear probing - private var vocab_max_size = 1000 - private var vocab_size = 0 - private var min_reduce = 1 - private val power: Double = 0.75 - - // assign internal params - private val rng = new Random(5) // fix the seed - private var train_words = 0 - private var prev_vocab_size = 0 - - // allocate data structures - private var vocab_hash = new Array[Int](vocab_hash_size) // vocab_hash stores pointer to the vocab array - private var vocab = new Array[vocab_word](vocab_max_size) // vocab array holds the (word, cnt) pairs. In future, huffman encoding will be added. - private var sampling_table = new Array[Int](sampling_table_size) - private var sub_sampling_table = new Array[Double](vocab_size) - - (0 until vocab_hash_size).foreach(i => vocab_hash(i) = -1) - (0 until vocab_max_size).foreach(i => vocab(i) = null) - - def size(): Int = vocab_size - def trainWords(): Long = { - // compute only if the vocab has changed . 
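For orientation, the CBOWNegSamplingExample and SkipGramNegSamplingExample classes above accumulate the same per-pair objective and gradient, following the word2vec negative-sampling formulation they cite (arXiv:1310.4546). A minimal, self-contained sketch of that computation (the object name is illustrative; `score` stands for the dot product of the two embeddings involved):

    // Hedged sketch of the (objective, factor) pair both Example classes compute.
    // For label +1 the objective is log sigmoid(score); for label -1 it is
    // log sigmoid(-score). `factor` is the derivative of the objective w.r.t.
    // the score; the gradient on each embedding is `factor` times the embedding
    // on the other side of the dot product (the summed context vector for CBOW).
    object NegSamplingObjective {
      def apply(score: Double, label: Int): (Double, Double) = {
        val exp = math.exp(-score)
        label match {
          case 1  => (-math.log1p(exp), exp / (1 + exp))
          case -1 => (-score - math.log1p(exp), -1 / (1 + exp))
          case _  => (0.0, 0.0)
        }
      }
    }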
- if (prev_vocab_size != vocab_size) { - train_words = 0 - vocab.foreach(word => train_words += word.cn) - prev_vocab_size = vocab_size - } - train_words - } - - def addWordToVocab(key: String): Unit = { - val id = getId(key) - if (id == -1) { - vocab(vocab_size) = new vocab_word(1, key) - vocab_size += 1 - // resize vocab array - if (vocab_size + 2 >= vocab_max_size) { - vocab_max_size += 1000 - var vocab1 = new Array[vocab_word](vocab_max_size) - (0 until vocab_size).foreach(i => vocab1(i) = vocab(i)) - (vocab_size until vocab_max_size).foreach(i => vocab1(i) = null) - vocab = vocab1 - } - var hash = get_word_hash(key) - while (vocab_hash(hash) != -1) { - hash = (hash + 1) % vocab_hash_size - } - vocab_hash(hash) = vocab_size - 1 - } else vocab(id).cn += 1 - // resize vocab_hash array - if (vocab_size > vocab_hash_size * load_factor) { - reduce_vocab(min_reduce) - min_reduce += 1 - } - - } - - def getCount(word: String): Int = { - var hash = get_word_hash(word) - var a = -1 - while (true) { - a = vocab_hash(hash) - if (a == -1) return -1 - if (vocab(a).wrd.equals(word)) return vocab(a).cn - hash = (hash + 1) % vocab_hash_size - } - return -1 - } - - def getCount(id: Int): Int = { - vocab(id).cn - } - - def getId(word: String): Int = { - var hash = get_word_hash(word) - var a = -1 - while (true) { - a = vocab_hash(hash) - if (a == -1) return -1 - if (vocab(a).wrd.equals(word)) return a - hash = (hash + 1) % vocab_hash_size - } - return -1 - } - - def getWord(id: Int): String = vocab(id).wrd - - // step-1 :first removes the word whose count < min_count , then sorts it - // step-2 : removes the stop-words - stops words from factorie and word length should be greater than 1. Will ignore punctutations - // step-3 : if vocab_size > max_vocab_size, then only take max_vocab_size - def sortVocab(min_count: Int = 5, ignoreStopWords: Int = 0, max_vocab_size: Int = 2e6.toInt): Unit = { - for (a <- vocab_size until vocab.size) vocab(a) = null - vocab = vocab.filter(ele => (ele != null && ele.cn >= min_count)).sortWith((x, y) => y.cn < x.cn) - if (ignoreStopWords == 1) { - vocab = vocab.filter(w => !nlp.lexicon.StopWords.containsWord(w.wrd) && w.wrd.size > 1 ) - } - if (vocab.size > max_vocab_size) - vocab = vocab.take(max_vocab_size) - vocab_size = vocab.size - rehash() - - } - // Vocab IO . Default option of storing is plain txt - def saveVocab(filename: String, binary: Int = 0, encoding: String = "UTF8"): Unit = { - var out = binary match { - case 0 => new java.io.PrintWriter(filename, encoding) - case 1 => new OutputStreamWriter(new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(filename))), encoding) - } - for (i <- 0 until vocab_size) { - out.write(vocab(i).wrd + " " + vocab(i).cn + "\n") // format : . TODO : JSON/XML in future ? 
- assert(i == getId(vocab(i).wrd)) - out.flush() - } - out.close() - } - - // load the vocab file - // format should be - // will also figure out if filename is binary or not based on filetype - def loadVocab(filename: String, encoding: String = "UTF8"): Unit = { - var in = filename.endsWith(".gz") match { - case true => io.Source.fromInputStream(new GZIPInputStream(new FileInputStream(filename)), encoding).getLines - case false => io.Source.fromFile(filename, encoding).getLines - } - for (line <- in) { - val wordCntdetail = line.stripLineEnd.split(' ') - assert(wordCntdetail.size == 2) - val word = wordCntdetail(0) - val cnt = wordCntdetail(1).toInt - addWordToVocab(word) - vocab(getId(word)).cn = cnt // Assumption : vocab file does have duplicate words - } - } - - // Sampling Functions : same as Google's word2vec - def buildSamplingTable(): Unit = { - var i = 0 - var train_words_pow: Long = 0 - var running_word_proportion: Double = 0 - for (a <- 0 until vocab_size) { - train_words_pow += math.pow(vocab(a).cn, power).toLong - if (train_words_pow < 0) { - println("train_words_pow went negative") - println("ERROR: buildSamplingTable failed") - return - } - } - i = 0 - running_word_proportion = math.pow(vocab(i).cn, power) / train_words_pow.toDouble - for (a <- 0 until sampling_table_size) { - sampling_table(a) = i - if (a / sampling_table_size.toDouble > running_word_proportion) { - i += 1 - running_word_proportion += math.pow(vocab(i).cn, power) / train_words_pow.toDouble - } - if (i >= vocab_size) i = vocab_size - 1 - } - } - // Sub Sampling : same as Google's word2vec - def buildSubSamplingTable(sample: Double): Unit = { - trainWords() - if (sample > 0) { - sub_sampling_table = new Array[Double](vocab_size) - for (a <- 0 until vocab_size) { - val cnt = vocab(a).cn - val ran = (math.sqrt(cnt / (sample * train_words)) + 1) * (sample * train_words) / cnt - sub_sampling_table(a) = ran - } - } - } - - def getRandWordId(): Int = sampling_table(rng.nextInt(sampling_table_size)) - def getRandWord(): String = vocab(getRandWordId).wrd - def getSubSampleProb(id: Int): Double = sub_sampling_table(id) - - // TODO huffman encoding for Hierarchical SoftMax: def BuildBinaryTree = { } - - // helper functions - // reduce_vocab is hacky function which makes sure vocab can handle any kind of streamming words - private def reduce_vocab(min_reduce: Int): Unit = { - var a = 0 - var b = 0 - // why not use vocab.filter() ? ans : filter would shrink the vocab array. We dont want that . 
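Two precomputed distributions drive the sampling helpers above and are easy to miss in the table-building code; a brief sketch of what they evaluate (`counts`, `totalWords`, and `sample` are illustrative stand-ins for the builder's internal state, not names from this file):

    // Hedged sketch of the distributions behind buildSamplingTable and
    // buildSubSamplingTable above.
    object VocabSamplingSketch {
      // Negative samples are drawn from the unigram distribution raised to the
      // 0.75 power (the builder's `power` constant); buildSamplingTable
      // materialises this as an index table so getRandWordId runs in O(1).
      def negSamplingProbs(counts: Array[Int]): Array[Double] = {
        val z = counts.map(c => math.pow(c, 0.75)).sum
        counts.map(c => math.pow(c, 0.75) / z)
      }
      // buildSubSamplingTable precomputes, per word, the value
      // (sqrt(cnt / t) + 1) * t / cnt with t = sample * totalWords; the models'
      // subSample() keeps a word only when this value is at least a uniform
      // draw in [0, 1), so very frequent words are discarded more often.
      def keepValue(cnt: Int, totalWords: Long, sample: Double): Double = {
        val t = sample * totalWords
        (math.sqrt(cnt / t) + 1) * t / cnt
      }
    }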
- // keep only those words whose cnt > min_reduce - for (a <- 0 until vocab_size) if (vocab(a).cn > min_reduce) { - vocab(b) = vocab(a) - b += 1 - } else vocab(a) = null - vocab_size = b - // now, rehash - rehash() - - } - // hash function for the string - private def get_word_hash(word: String): Int = { - var hash: Long = 0 // made Long to avoid overflow - for (ch <- word) - hash = (hash * 257 + ch) % vocab_hash_size - hash = hash % vocab_hash_size - return hash.toInt - } - - private def rehash(): Unit = { - (0 until vocab_hash_size).foreach(i => vocab_hash(i) = -1) - for (a <- 0 until vocab_size) { - var hash = get_word_hash(vocab(a).wrd) - while (vocab_hash(hash) != -1) hash = (hash + 1) % vocab_hash_size - vocab_hash(hash) = a - } - } - -} - -class vocab_word(cnt: Int = 0, w: String) { - var cn = cnt - var wrd = w - override def toString() = "( " + cn + ", " + wrd + " ) " -} diff --git a/src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbedding.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbedding.scala deleted file mode 100644 index dac0c49..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbedding.scala +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.embeddings - -import java.util.zip.GZIPInputStream - -import cc.factorie.la - -class WordEmbedding(val inputStreamFactory: () => java.io.InputStream, val dimensionSize: Int,numTake: Int = -1) extends scala.collection.mutable.LinkedHashMap[String,la.DenseTensor1] { - def sourceFactory(): io.Source = io.Source.fromInputStream(new GZIPInputStream(inputStreamFactory()),"iso-8859-1") - - println("Reading Word Embeddings with dimension: %d".format(dimensionSize)) - - initialize() - def initialize() { - val source = sourceFactory() - var count = 0 - val lines = if(numTake > 0) source.getLines().take(numTake) else source.getLines() - val firstLine = lines.next() - val firstFields = firstLine.split("\\s+") - val numLines = firstFields(0).toInt - val dimension = firstFields(1).toInt - assert(dimension == dimensionSize,"the specified dimension %d does not agree with the dimension %d given in the input file".format(dimension,dimensionSize)) - - for (line <- lines) { - val fields = line.split("\\s+") - val tensor = new la.DenseTensor1(fields.drop(1).map(_.toDouble)) - assert(tensor.dim1 == dimensionSize,"the tensor has length " + tensor.dim1 + " , but it should have length + " + dimensionSize) - this(fields(0)) = tensor - count += 1 - if (count % 100000 == 0) println("word vector count: %d".format(count)) - } - source.close() - } - -} - -trait WordEmbeddingOptions extends cc.factorie.util.CmdOptions { - val useEmbeddings = new CmdOption("use-embeddings",false,"BOOLEAN","Whether to use word embeddings") - val embeddingFile = new CmdOption("embedding-file", "", "STRING", "path to word2vec format file") - val embeddingDim = new CmdOption("embedding-dim", 100, "INT", "embedding dimension") - val embeddingScale = new CmdOption("embedding-scale", 10.0, "FLOAT", "The scale of the embeddings") - val numEmbeddingsToTake = new CmdOption("num-embeddings-to-take",-1,"INT","how many embeddings to take (assuming the file is sorted by word frequency. Default takes all of them") -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbeddingModel.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbeddingModel.scala deleted file mode 100644 index f4092f8..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbeddingModel.scala +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.embeddings -import java.io.{BufferedOutputStream, File, FileInputStream, FileOutputStream, OutputStreamWriter} -import java.util.zip.{GZIPInputStream, GZIPOutputStream} - -import cc.factorie.la.DenseTensor1 -import cc.factorie.model.{Parameters, Weights} -import cc.factorie.optimize.AdaGradRDA -import cc.factorie.util.Threading - -abstract class WordEmbeddingModel(val opts: EmbeddingOpts) extends Parameters { - - // Algo related - val D = opts.dimension.value // default value is 200 - var V: Int = 0 // vocab size. Will computed in buildVocab() section - protected val numIterations = opts.numIterations.value // default value is 1 - protected val threads = opts.threads.value // default value is 12 - protected val adaGradDelta = opts.delta.value // default value is 0.1 - protected val adaGradRate = opts.rate.value // default value is 0.025 - protected val minCount = opts.minCount.value // default value is 5 - protected val ignoreStopWords = if (opts.ignoreStopWords.value == true) 1 else 0 // default value is 0 - protected val vocabHashSize = opts.vocabHashSize.value // default value is 20 M. load factor is 0.7. So, Vocab size = 0.7 * 20M = 14M vocab supported which is sufficient enough for large amounts of data - protected val samplingTableSize = opts.samplingTableSize.value // default value is 100 M - protected val maxVocabSize = opts.vocabSize.value - - // IO Related - val corpus = opts.corpus.value // corpus input filename. Code takes cares of .gz extension - protected val outputFilename = opts.output.value // embeddings output filename - private val storeInBinary = if(opts.binary.value == true) 1 else 0 // binary=1 will make both vocab file (optional) and embeddings in .gz file - private val loadVocabFilename = opts.loadVocabFile.value // load the vocab file. Very useful for large corpus should you run multiple times - private val saveVocabFilename = opts.saveVocabFile.value // save the vocab into a file. Next time for the same corpus, load it . Saves lot of time on large corpus - private val encoding = opts.encoding.value // Default is UTF8 - // data structures - protected var vocab: VocabBuilder = null - protected var trainer: LiteHogwildTrainer = null // modified version of factorie's hogwild trainer for speed by removing logging and other unimportant things. Expose processExample() instead of processExamples() - protected var optimizer: AdaGradRDA = null - - var weights: Seq[Weights] = null // EMBEDDINGS . Will be initialized in learnEmbeddings() after buildVocab() is called first - private var train_words: Long = 0 // total # of words in the corpus. Needed to calculate the distribution of the work among threads and seek points of corpus file - - // Component-1 - def buildVocab(): Unit = { - vocab = new VocabBuilder(vocabHashSize, samplingTableSize, 0.7) // 0.7 is the load factor - if (loadVocabFilename.size == 0) { - println("Building Vocab") - val corpusLineItr = corpus.endsWith(".gz") match { - case true => io.Source.fromInputStream(new GZIPInputStream(new FileInputStream(corpus)), encoding).getLines - case false => io.Source.fromInputStream(new FileInputStream(corpus), encoding).getLines - } - while (corpusLineItr.hasNext) { - val line = corpusLineItr.next - line.stripLineEnd.split(' ').foreach(word => vocab.addWordToVocab(word)) // addWordToVocab() will incr by count. TODO : make this also parallel ? 
but it is an one time process, next time use load-vocab option - } - } else { - println("Loading Vocab") - vocab.loadVocab(loadVocabFilename, encoding) - } - vocab.sortVocab(minCount, ignoreStopWords, maxVocabSize) // removes words whose count is less than minCount and sorts by frequency - vocab.buildSamplingTable() // for getting random word from vocab in O(1) otherwise would O(log |V|) - vocab.buildSubSamplingTable(opts.sample.value) // precompute subsampling table - V = vocab.size() - train_words = vocab.trainWords() - println("Corpus Stat - Vocab Size :" + V + " Total words (effective) in corpus : " + train_words) - // save the vocab if the user provides the filename save-vocab - if (saveVocabFilename.size != 0) { - println("Saving Vocab into " + saveVocabFilename) - vocab.saveVocab(saveVocabFilename, storeInBinary, encoding) // for every word, - println("Done Saving Vocab") - } - - } - - // Component-2 - def learnEmbeddings(): Unit = { - println("Learning Embeddings") - optimizer = new AdaGradRDA(delta = adaGradDelta, rate = adaGradRate) - weights = (0 until V).map(i => Weights(TensorUtils.setToRandom1(new DenseTensor1(D, 0)))) // initialized using wordvec random - optimizer.initializeWeights(this.parameters) - trainer = new LiteHogwildTrainer(weightsSet = this.parameters, optimizer = optimizer, nThreads = threads, maxIterations = Int.MaxValue) - val threadIds = (0 until threads).map(i => i) - val fileLen = new File(corpus).length - (1 to numIterations).foreach { iteration => - println(s"Beginning Training Iteration $iteration") - Threading.parForeach(threadIds, threads)(threadId => workerThread(threadId, fileLen)) - } - println("Done learning embeddings. ") - //store() - } - - // Component-3 - def store(): Unit = { - println("Now, storing the embeddings .... ") - val out = storeInBinary match { - case 0 => new java.io.PrintWriter(outputFilename, encoding) - case 1 => new OutputStreamWriter(new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(outputFilename))), encoding) - } - // format : - // - // []*dim-size - out.write("%d %d\n".format(V, D)) - for (v <- 0 until V) { - out.write(vocab.getWord(v) + " " ) - val embedding = weights(v).value - for (d <- 0 until D) - out.write(embedding(d) + " ") - out.write("\n") - out.flush() - } - out.close() - println("Done storing embeddings") - } - - protected def workerThread(id: Int, fileLen: Long, printAfterNDoc: Long = 100): Unit = { - val skipBytes: Long = fileLen / threads * id // fileLen now pre-computed before pasing to all threads. skip bytes. skipped bytes is done by other workers - val lineItr = new FastLineReader(corpus, skipBytes, encoding) - var word_count: Long = 0 - var work = true - var ndoc = 0 - val total_words_per_thread = train_words / threads // worker amount . - while (lineItr.hasNext && work) { - word_count += process(lineItr.next) // Design choice : should word count be computed here and just expose process(doc : String): Unit ?. - ndoc += 1 - if (id == 1 && ndoc % printAfterNDoc == 0) { // print the process after processing 100 docs in 1st thread. 
It approx reflects the total progress - println("Progress : " + word_count / total_words_per_thread.toDouble * 100 + " %") - } - if (word_count > total_words_per_thread) work = false // Once, word_count reaches this limit, ask worker to end - } - } - - // override this function in your Embedding Model like SkipGramEmbedding or CBOWEmbedding - protected def process(doc: String): Int -} diff --git a/src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbeddingUtils.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbeddingUtils.scala deleted file mode 100644 index a2820a5..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/WordEmbeddingUtils.scala +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.embeddings -import java.io.{BufferedReader, FileInputStream, InputStreamReader} -import java.util.zip.GZIPInputStream - -import cc.factorie.la._ - -import scala.util.Random - -class FastWordReader(file: String, encoding: String = "UTF8") extends Iterator[String] { - private var in = file.endsWith(".gz") match { - case false => new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding)) - case true => new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), encoding)) - } - private var sb: StringBuilder = null - moveToNext() - - private def moveToNext(): Unit = { - sb = new StringBuilder() - var c = in.read // read char - // go inside only if c is bad char - while (c != -1 && (c == '\n' || c == '\t' || c == ' ' || c == '\r')) c = in.read - - // go inside only if c is good char - while (c != -1 && c != '\n' && c != '\t' && c != ' ' && c != '\r') { - sb.+=(c.toChar) // add the good char - c = in.read() // read next char - } - } - - def hasNext(): Boolean = sb.length() > 0 - def next(): String = { moveToNext; sb.toString } - -} - -class FastLineReader(file: String, skipBytes: Long = 0, encoding: String = "UTF8") extends Iterator[String] { - //private var in = new FileReader(file) - private var sb: StringBuilder = null - private var line: String = null - private var in = file.endsWith(".gz") match { - case false => new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding)) - case true => new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)), encoding)) - } - in.skip(skipBytes) - moveToNext() - - private def moveToNext(): Unit = { - sb = new StringBuilder() - var c = in.read() // read char . Internally, read 1 byte or 2 byte or 3 bytes depending encoding. 
- // go inside only if c is bad char - while (c != -1 && c == '\n') c = in.read - - // go inside only if c is good char - while (c != -1 && c != '\n') { - sb.+=(c.toChar) // add the good char - c = in.read() // read next char - } - } - - def hasNext(): Boolean = { - return sb.length() > 0 - } - def next(): String = { moveToNext; sb.toString } - -} - -object TensorUtils { - val rng = new Random(5) // fix the seed - def cosineDistance(x: Tensor1, y: Tensor1): Double = { - val xnorm = x.twoNorm - val ynorm = y.twoNorm - x.dot(y) / (xnorm * ynorm) - - } - // random initialization is done in the same way as google's word2vec. - def setToRandom1(t: DenseTensor1): DenseTensor1 = { - for (i <- 0 until t.length) - t(i) = (rng.nextInt(Int.MaxValue) / (Int.MaxValue - 1).toDouble) / t.length - t - } - def setToRandom2(t : DenseTensor2): DenseTensor2 = { - val V = t.dim1 - val D = t.dim2 - for (v <- 0 until V) - for (d <- 0 until D) - t(v,d) = (rng.nextInt(Int.MaxValue) / (Int.MaxValue - 1).toDouble) / D - t - } - -} - diff --git a/src/main/scala/cc/factorie/app/nlp/embeddings/WordVec.scala b/src/main/scala/cc/factorie/app/nlp/embeddings/WordVec.scala deleted file mode 100644 index d920c68..0000000 --- a/src/main/scala/cc/factorie/app/nlp/embeddings/WordVec.scala +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -//package cc.factorie.app.nlp.embeddings -import java.nio.charset.Charset -/* -object WordVec { - def main(args: Array[String]) { - val opts = new EmbeddingOpts - opts.parse(args) - println("Default Charset of this JVM=" + Charset.defaultCharset()); - println("User Provided Charset for this project=" + opts.encoding.value) - - if (opts.explore.wasInvoked) { - EmbeddingDistance.run(opts) - return - } - - val wordEmbedding = if (opts.cbow.value == true) new CBOWNegSamplingEmbeddingModel(opts) else new SkipGramNegSamplingEmbeddingModel(opts) - val st1 = System.currentTimeMillis() - wordEmbedding.buildVocab() - val st = System.currentTimeMillis() - println("time taken to create vocab : " + (st - st1) / 1000.0) - wordEmbedding.learnEmbeddings() - val en = System.currentTimeMillis() - st - println("time taken to learn embeddings : " + en / 1000.0) - val st2 = System.currentTimeMillis() - wordEmbedding.store() - val en1 = System.currentTimeMillis() - st2 - println("time taken to store embeddings :" + en1 / 1000.0) - - } -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/CanopyPairGenerator.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/CanopyPairGenerator.scala deleted file mode 100644 index c230006..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/CanopyPairGenerator.scala +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. 
- This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.infer.{Proposal, SettingsSampler} - -import scala.annotation.tailrec -import scala.collection.mutable -import scala.util.Random - -/** - * @author John Sullivan - */ -trait Canopy { - def canopies:Iterable[String] -} - -trait SingularCanopy extends Canopy { - final def canopies = Seq(canopy) - - def canopy:String -} - -private[hcoref] class EntityPairGenHashSet[Vars <: NodeVariables[Vars]](mentionSize:Int)(implicit random:Random) extends mutable.HashSet[Node[Vars]] { - override def initialSize = (mentionSize * 1.5).toInt - - def sample:Node[Vars] = { - var cand = table(random.nextInt(table.length)) - while(cand == null ) { - cand = table(random.nextInt(table.length)) - } - cand.asInstanceOf[Node[Vars]] - } -} - -trait CanopyPairGenerator[Vars <: NodeVariables[Vars] with Canopy] extends PairGenerator[Vars] { - this:SettingsSampler[(Node[Vars], Node[Vars])] => - - protected var canopies = new mutable.HashMap[String,EntityPairGenHashSet[Vars]]() - var entities = new EntityPairGenHashSet[Vars](mentions.size) - var nonexistentEnts = new mutable.HashSet[Node[Vars]] - mentions foreach addEntity - - proposalHooks += {proposal:Proposal[(Node[Vars], Node[Vars])] => - val iter = proposal.diff.iterator - while(iter.hasNext) { - val diff = iter.next() - if(diff.variable.isInstanceOf[Node[Vars]#Exists]) { - val v = diff.variable.asInstanceOf[Node[Vars]#Exists] - val newValue = v.booleanValue - diff.undo() - val oldValue = v.booleanValue - diff.redo() - if(newValue != oldValue) { - if(newValue) { - addEntity(v.node) - } else { - nonexistentEnts += v.node - } - } - } - } - } - - def addEntity(e:Node[Vars]):Unit ={ - entities += e - val iter = e.variables.canopies.iterator - while(iter.hasNext) { - val canopy = iter.next() - val canopyEnts = canopies.getOrElse(canopy, new EntityPairGenHashSet[Vars](5)) - canopyEnts += e - canopies(canopy) = canopyEnts - } - } - - def nextEntityPair:(Node[Vars],Node[Vars]) = { - val e1 = getEntity(null) - val e2 = getEntity(e1) - e1 -> e2 - } - - def nextContext = nextEntityPair - - @tailrec - private def getEntity(context:Node[Vars]):Node[Vars] = if(context != null) { - val nodeCanopies = context.variables.canopies.toSeq - - val candidates = canopies(nodeCanopies(random.nextInt(nodeCanopies.size))) - - if(candidates.size <= 1) { - getEntity(null) - } else { - var e = candidates.sample - var i = 0 - while(!e.exists) { - i += 1 - e = candidates.sample - if(i % 5 == 0) { - cleanEntities() - } - } - e - } - } else { - var e = entities.sample - var i = 0 - while(!e.exists) { - i += 1 - e = entities.sample - if(i % 5 == 0) { - cleanEntities() - } - } - e - } - - private def cleanEntities(): Unit = { - val iter = nonexistentEnts.iterator - while(iter.hasNext) { - entities remove iter.next() - } - nonexistentEnts = new mutable.HashSet[Node[Vars]] - } - -} diff 
--git a/src/main/scala/cc/factorie/app/nlp/hcoref/CorefModel.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/CorefModel.scala deleted file mode 100644 index 3b3f782..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/CorefModel.scala +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.hcoref - -import cc.factorie._ -import cc.factorie.model._ -import cc.factorie.variable.BooleanValue - -/** - * @author John Sullivan - */ -trait DebuggableModel[Vars <: NodeVariables[Vars]] extends CorefModel[Vars] { - def debugOn() {templates.collect{case t:DebuggableTemplate => t.debugOn()}} - def debugOff() {templates.collect{case t:DebuggableTemplate => t.debugOff()}} -} - -abstract class CorefModel[Vars <: NodeVariables[Vars]] extends TemplateModel with Parameters { // This is to ensure that the model's features' NodeVariables match the type of the model's NodeVariables -implicit val params:Parameters = this - - this += new StructuralPrior[Vars] -} - -class StructuralPrior[Vars <: NodeVariables[Vars]](entityvalue:Double = 0.5, - subEntityValue:Double = -0.25) - extends TupleTemplateWithStatistics3[Node[Vars]#IsRoot, Node[Vars]#IsMention, Node[Vars]#Exists] with DebuggableTemplate{ - - def name = "StructuralPrior" - def unroll1(isRoot: Node[Vars]#IsRoot) = Factor(isRoot, isRoot.node.isMentionVar, isRoot.node.existsVar) - - def unroll2(isMention: Node[Vars]#IsMention) = throw new Exception( - "is Mention changed for %s. This should never happen" format isMention.node) - - def unroll3(exists: Node[Vars]#Exists) = Factor(exists.node.isRootVar, exists.node.isMentionVar, exists) - - def score(isRoot: BooleanValue, isMention: BooleanValue, exists: BooleanValue): Double = { - if(isRoot.booleanValue && exists.booleanValue) { - report(entityvalue, 1.0) - entityvalue - } else if(!isRoot.booleanValue && !isMention.booleanValue && exists.booleanValue) { - report(subEntityValue, 1.0) - subEntityValue - } else { - report(0.0, 1.0) - 0.0 - } - } -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/CorefSampler.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/CorefSampler.scala deleted file mode 100644 index 15e6021..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/CorefSampler.scala +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.infer.{Proposal, SettingsSampler} -import cc.factorie.util.Hooks1 - -import scala.reflect.ClassTag -import scala.util.Random - -/** - * User:harshal, John Sullivan - * Date: 10/28/13 - */ -abstract class CorefSampler[Vars <: NodeVariables[Vars]](override val model:CorefModel[Vars], val mentions:Iterable[Node[Vars]], val iterations:Int)(implicit override val random:Random, val varsTag:ClassTag[Vars]) - extends SettingsSampler[(Node[Vars], Node[Vars])](model) { - this: PairGenerator[Vars] with MoveGenerator[Vars] => - - this.temperature = 0.001 - - val beforeInferHooks = new Hooks1[Unit] - protected def beforeInferHook = beforeInferHooks - val afterInferHooks = new Hooks1[Unit] - protected def afterInferHook = afterInferHooks - - def infer() { - beforeInferHook - contexts foreach process - afterInferHook - } - -} - -trait AutoStoppingAcceptSampler[Vars <: NodeVariables[Vars]] extends CorefSampler[Vars] { - this: PairGenerator[Vars] with MoveGenerator[Vars] => - - def autoStopAcceptThreshold:Int - - private var proposalIdx = 0 - private var lastRejected = 0 - private var runOfRejectedProposals = 0 - - proposalHooks += {p:Proposal[(Node[Vars], Node[Vars])] => - proposalIdx += 1 - if(p.diff.isEmpty) { // the proposal was rejected - if(proposalIdx - 1 == lastRejected) { // we rejected the last proposal as well - runOfRejectedProposals += 1 - } else { - runOfRejectedProposals = 1 - } - lastRejected = proposalIdx - } - } - - override def infer(): Unit = { - beforeInferHook - - val contextIter = contexts.toIterator - - while(contextIter.hasNext && runOfRejectedProposals < autoStopAcceptThreshold) { - process(contextIter.next()) - } - - if(proposalIdx == iterations) { - println("Stopping at max iterations of %d steps" format proposalIdx) - } else { - println("Stopping automatically after %d steps" format proposalIdx) - } - afterInferHook - } - -} - -trait AutoStoppingSampler[Vars <: NodeVariables[Vars]] extends CorefSampler[Vars] { - this: PairGenerator[Vars] with MoveGenerator[Vars] => - - def autoStopThreshold:Int - - private var runOfEmptyProposals = 0 - - - override def processProposals(props: Seq[Proposal[(Node[Vars], Node[Vars])]]) = { - if(props.size == 1) { // a proposal of size one is 'empty' because NoMove is always a valid choice - runOfEmptyProposals += 1 - } else { - runOfEmptyProposals = 0 - } - super.processProposals(props) - } - - override def infer() = { - beforeInferHook - var step = 0 - - while (step < iterations && runOfEmptyProposals < autoStopThreshold) { - process(nextContext) - step += 1 - } - println("Stopping automatically after %d steps".format(step)) - afterInferHook - } -} - -/** - * Trait for exposing proposalHooks to [[java.lang.Runnable]] - */ -trait RunnableHook[Vars <: NodeVariables[Vars]] { - this: CorefSampler[Vars] => - - def runnable:java.lang.Runnable - - proposalHooks += {_ => runnable.run()} -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/DebugCoref.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/DebugCoref.scala deleted file mode 100644 index 5210300..0000000 --- 
a/src/main/scala/cc/factorie/app/nlp/hcoref/DebugCoref.scala +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.infer.Proposal - -//todo this is ugly and bad -trait Logger { - def log(s:String):Unit -} -trait PrintlnLogger extends Logger { - final def log(s:String) {println(s)} -} -trait PassedLogger extends Logger { - def _log:Logger - final def log (s:String) {_log.log(s)} -} - -/** - * @author John Sullivan - */ -trait DebugCoref[Vars <: NodeVariables[Vars]]{ - this: CorefSampler[Vars] with PairGenerator[Vars] with MoveGenerator[Vars] with Logger => - - var printEvery:Int = 10000 - - var acceptedProps = 0.0 - var acceptedThisRound = 0.0 - var totalProps = 0 - lazy val begin = System.currentTimeMillis() - var startTime = begin - var stopTime = 0L - - beforeInferHooks += { _ => - startTime = begin - } - - afterInferHooks += { _ => - debugStatement() - } - - private def debugStatement() { - - stopTime = System.currentTimeMillis() - val elapsedSecs = (stopTime - startTime) / 1000.0 - val elapsedFromBegin = (stopTime - begin) / 1000.0 - val percentAccepted = (acceptedProps / totalProps)*100 - val percentAcceptedThisRound = (acceptedThisRound / printEvery)*100 - val propsPerSec = printEvery.toDouble / elapsedSecs - val totalPropsPerSec = totalProps.toDouble / elapsedFromBegin - val depths = mentions.map(_.depth) - val maxDepth = depths.max - val minDepth = depths.min - val aveDepth = depths.sum.toDouble / depths.size - val roots = mentions.map(_.root).toSet - val rootChildrens = roots.map(_.children.size) - val rootMentions = roots.map(_.mentionCountVar.value) - val maxChildren = rootChildrens.max - val minChildren = rootChildrens.min - val aveChildren = rootChildrens.sum.toDouble / rootChildrens.size - val maxMentions = rootMentions.max - val minMentions = rootMentions.min - val aveMentions = rootMentions.sum.toDouble / rootMentions.size - log(f"After $totalProps%d proposals $percentAccepted%.2f%% ($percentAcceptedThisRound%.2f%% this round) accepted in $elapsedFromBegin%.3f secs ($totalPropsPerSec%.2f proposals/sec). 
This round of $printEvery%d took $elapsedSecs%.3f secs ($propsPerSec%.2f proposals/sec)") - log(f"\t max depth: $maxDepth min depth: $minDepth ave depth: $aveDepth%.2f") - log(f"\t max children: $maxChildren min children: $minChildren ave children: $aveChildren%.2f") - log(f"\t max mentions: $maxMentions min mentions: $minMentions ave mentions: $aveMentions%.2f") - startTime = stopTime - acceptedThisRound = 0.0 - } - - proposalHooks += {p:Proposal[(Node[Vars], Node[Vars])] => - totalProps +=1 - if(p.diff.nonEmpty) { - acceptedProps += 1 - acceptedThisRound += 1 - } - if(totalProps % printEvery == 0) { - debugStatement() - } - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/DebugDiffList.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/DebugDiffList.scala deleted file mode 100644 index 2ed0a78..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/DebugDiffList.scala +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.infer.SettingsSampler -import cc.factorie.variable.SettingIterator -import cc.factorie.{DiffList, Model, TemplateModel} - -/** - * @author John Sullivan - */ -trait DebuggableTemplate { - protected var _debug: Boolean = false - def debugOn() = _debug = true - def debugOff() = _debug = false - def name: String - - /** methods implementing this trait need to call report manually during the scoring process - * to print out debug results */ - def report(score:Double, weight:Double) { - if(_debug) { - println("\t%.4f = %.4f * %.4f (score * weight) [%s]".format(score * weight, score, weight, name)) - } - } -} - -class DebugDiffList extends DiffList { - override def scoreAndUndo(model:Model): Double = { - println("scoring and undoing") - model.asInstanceOf[TemplateModel].families.collect{case t:DebuggableTemplate => t.debugOn()} - - if (this.length == 0) return 0.0 // short-cut the simple case - println("=====DEBUGGING MODEL SCORE=====") - println("----NEXT WORLD----") - var s = model.currentScore(this) - println(" next: "+ s) - //log(Log.DEBUG)("DiffList scoreAndUndo pre-undo score=" + s) - this.undo() - // We need to re-calculate the Factors list because the structure may have changed - println("----CURRENT WORLD----") - val s2 = model.currentScore(this) - println(" current: "+s2) - s -= s2 - println("TOTAL SCORE: "+s) - model.asInstanceOf[TemplateModel].families.collect{case t:DebuggableTemplate => t.debugOff()} - s - }} - -object DebugModel { - def debugOn(model:Model) { - model.asInstanceOf[TemplateModel].templates.foreach { - case debuggable:DebuggableTemplate => debuggable.debugOn() - case _ => () - } - } - - def debugOff(model:Model) { - model.asInstanceOf[TemplateModel].templates.foreach{ - case debuggable:DebuggableTemplate => debuggable.debugOff() - case _ => () - } - } -} - -trait 
DebugSettingIterator extends SettingIterator { - override def newDiffList = new DebugDiffList -} - -trait DebugDiffListMoveGenerator[Vars <: NodeVariables[Vars]] extends MoveGenerator[Vars]{ - this :SettingsSampler[(Node[Vars], Node[Vars])] => - - DebugModel.debugOn(model) - - def settings(c:(Node[Vars], Node[Vars])) = new DebugSettingIterator with MoveSettingIterator[Vars] { - - var (e1, e2) = c - - val moves = new scala.collection.mutable.ArrayBuffer[Move[Vars]]() - - if(e1.root != e2.root) { - if(e1.isMention && e1.isRoot && e2.isMention && e2.isRoot) { - moves += new MergeUp[Vars](e1, e2)({d => newInstance(d)}) - } else { - while (e1 != null) { - if(e1.mentionCountVar.value >= e2.mentionCountVar.value) { - moves += new MergeLeft[Vars](e1, e2) - } else { - moves += new MergeLeft[Vars](e2, e1) - } - e1 = e1.getParent.getOrElse(null.asInstanceOf[Node[Vars]]) - } - } - } - - moves += new NoMove[Vars] - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/DefaultMoveGenerator.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/DefaultMoveGenerator.scala deleted file mode 100644 index 7b2b053..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/DefaultMoveGenerator.scala +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.infer.SettingsSampler - -/** - * @author John Sullivan - */ -trait DefaultMoveGenerator[Vars <: NodeVariables[Vars]] extends MoveGenerator[Vars]{ - this :SettingsSampler[(Node[Vars], Node[Vars])] => - - def settings(c:(Node[Vars], Node[Vars])) = new MoveSettingIterator[Vars] { - var (e1, e2) = c - - val moves = new scala.collection.mutable.ArrayBuffer[Move[Vars]]() - - if(e1.root != e2.root) { - if(e1.isMention && e1.isRoot && e2.isMention && e2.isRoot) { - moves += new MergeUp[Vars](e1, e2)({d => newInstance(d)}) - } else if(e1.isMention && e2.isMention) { - if(e1.parent != null) { - moves += new MergeLeft[Vars](e1.parent, e2) - } - if(e2.parent != null) { - moves += new MergeLeft[Vars](e2.parent, e1) - } - } else { - while (e1 != null) { - if(e1.mentionCountVar.value >= e2.mentionCountVar.value && !e1.isMention) { - moves += new MergeLeft[Vars](e1, e2) - } else { - if(e2.isMention) { // we should only be here if e2 has a parent - moves += new MergeLeft[Vars](e2.parent, e1) - } else { - moves += new MergeLeft[Vars](e2, e1) - } - } - e1 = e1.getParent.getOrElse(null.asInstanceOf[Node[Vars]]) - } - } - } else { - if(e1.mentionCountVar.value > e2.mentionCountVar.value) { - moves += new SplitRight[Vars](e2, e1) - } else { - moves += new SplitRight[Vars](e1, e2) - } - } - - moves += new NoMove[Vars] - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/DeterministicPairGenerator.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/DeterministicPairGenerator.scala deleted file mode 100644 index 55a6b85..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/DeterministicPairGenerator.scala +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
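The move generators above enumerate candidate structural changes for a sampled node pair: merge two root mentions under a new parent, merge the smaller side left into the larger, split a subtree off when both nodes share a root, or do nothing. A simplified sketch of that case analysis over a toy tree type; TreeNode and the Move case classes here are hypothetical stand-ins, not the classes being removed in this patch, and the real generators also walk up the ancestor chain.

object MoveEnumerationSketch {
  // Toy tree node: parent link plus a count of mentions below it.
  final class TreeNode(val isMention: Boolean, var parent: Option[TreeNode] = None) {
    def root: TreeNode = parent.map(_.root).getOrElse(this)
    def isRoot: Boolean = parent.isEmpty
    var mentionCount: Int = if (isMention) 1 else 0
  }

  sealed trait Move
  case class MergeUp(a: TreeNode, b: TreeNode) extends Move      // create a new common parent
  case class MergeLeft(dst: TreeNode, src: TreeNode) extends Move // src moves under dst
  case class SplitRight(child: TreeNode) extends Move             // detach child from its parent
  case object NoMove extends Move

  // Enumerate candidate moves for a pair, mirroring the structure-based cases above.
  def candidateMoves(e1: TreeNode, e2: TreeNode): Seq[Move] = {
    val moves = scala.collection.mutable.ArrayBuffer[Move]()
    if (e1.root != e2.root) {
      if (e1.isMention && e1.isRoot && e2.isMention && e2.isRoot)
        moves += MergeUp(e1, e2)
      else if (e1.mentionCount >= e2.mentionCount)
        moves += MergeLeft(e1, e2)
      else
        moves += MergeLeft(e2, e1)
    } else {
      // same tree: propose detaching the smaller side
      moves += SplitRight(if (e1.mentionCount > e2.mentionCount) e2 else e1)
    }
    moves += NoMove
    moves.toSeq
  }

  def main(args: Array[String]): Unit = {
    val m1 = new TreeNode(isMention = true)
    val m2 = new TreeNode(isMention = true)
    println(candidateMoves(m1, m2)) // two root mentions -> a MergeUp plus NoMove
  }
}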
*/ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.infer.{Proposal, SettingsSampler} - -import scala.collection.mutable - -/** - * @author John Sullivan - */ -trait DeterministicPairGenerator[Vars <: NodeVariables[Vars]] extends PairGenerator[Vars]{ - this:SettingsSampler[(Node[Vars], Node[Vars])] => - - var i = 0 - def mentionSequence:IndexedSeq[(String, String, String)] - - var nextId:String = "NONE SET" - - proposalHooks += {p:Proposal[(Node[Vars], Node[Vars])] => - val (e1, e2) = p.context - e1.getParent match { - case Some(parent) => mentionMap.put(parent.uniqueId, parent) - case None => Unit - } - e2.getParent match { - case Some(parent) => mentionMap.put(parent.uniqueId, parent) - case None => Unit - } - } - - private val mentionMap = mutable.HashMap[String, Node[Vars]]() - mentionMap ++= mentions.map(m => m.uniqueId -> m) - - override def nextContext:(Node[Vars], Node[Vars]) = { - val e1 = sampleEntity - var e2 = sampleEntity - while(e1 == e2) { - e2 = sampleEntity - } - - val (e1Id, e2Id, parentId) = mentionSequence(i) - i += 1 - nextId = parentId - mentionMap.getOrElse(e1Id,sampleEntity) -> mentionMap.getOrElse(e2Id, sampleEntity) - } - - protected val _allEntities = mutable.ArrayBuffer[Node[Vars]]() - - _allEntities ++= mentions - - def addEntity(e:Node[Vars]) {_allEntities += e} - - def allEntities:Iterable[Node[Vars]] = _allEntities - - def performMaintenance { - val cleanEntities = new mutable.ArrayBuffer[Node[Vars]] - cleanEntities ++= _allEntities.filter(_.exists) - _allEntities.clear() - _allEntities ++= cleanEntities - } - - def sampleEntity:Node[Vars] = { - var tries = 5 - val numEnts = _allEntities.size - var e: Node[Vars] = null.asInstanceOf[Node[Vars]] - while({tries -=1; tries} >= 0 && (e == null || !e.exists)) { - e = _allEntities.toSeq(random.nextInt(numEnts)) - if(tries==1) { - performMaintenance - } - } - e - - } - - proposalHooks += {p:Proposal[(Node[Vars], Node[Vars])] => - val (e1, e2) = p.context - e1.getParent match { - case Some(ent) => addEntity(ent) - case None => e2.getParent match { - case Some(ent) => addEntity(ent) - case None => Unit - } - } - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/DocEntityCoref.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/DocEntityCoref.scala deleted file mode 100644 index 082f041..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/DocEntityCoref.scala +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.app.nlp.Document -import cc.factorie.app.nlp.coref.{CrossDocEntity, WithinDocCoref} -import cc.factorie.variable.DiffList - -import scala.util.Random - -/** - * @author John Sullivan - */ -abstract class DocEntityCoref { - _settings => - def autoStopThreshold:Int - def estimateIterations(mentionCount:Int):Int - def model:CorefModel[DocEntityVars] - implicit val random:Random - - - def process(docs:Iterable[Document]):Iterable[CrossDocEntity] = { - assert(docs.forall(_.hasAnnotation(classOf[WithinDocCoref]))) - - // by mentions here we mean cross-doc mentions that correspond to within-doc entities - val mentions = docs.flatMap { doc => - doc.coref.entities.map{ winDocEntity => - new Mention[DocEntityVars](DocEntityVars.fromWithinDocEntity(winDocEntity), java.util.UUID.randomUUID.toString, winDocEntity.uniqueId)(null) - } - } - - val sampler = getSampler(mentions) - - sampler.infer - - mentions.map(_.root).toSeq - } - - - def getSampler(mentions:Iterable[Node[DocEntityVars]]) = new CorefSampler[DocEntityVars](_settings.model, mentions, _settings.estimateIterations(mentions.size)) - with AutoStoppingSampler[DocEntityVars] - with CanopyPairGenerator[DocEntityVars] - with NoSplitMoveGenerator[DocEntityVars] - with DebugCoref[DocEntityVars] - with TrainingObjective[DocEntityVars] - with PrintlnLogger { - def newInstance(implicit d: DiffList) = new Node[DocEntityVars](new DocEntityVars()) - - val autoStopThreshold = _settings.autoStopThreshold - } - -} - -class DocEntityCorefModel(namesWeights:Double, namesShift:Double, nameEntropy:Double, contextsWeight:Double, contextsShift:Double, genderWeight:Double, genderShift:Double, mentionWeight:Double, mentionShift:Double, numberWeight:Double, numberShift:Double) extends CorefModel[DocEntityVars] { - this += new ChildParentCosineDistance(namesWeights, namesShift, {v:DocEntityVars => v.names}) - this += new ChildParentCosineDistance(contextsWeight, contextsShift, {v:DocEntityVars => v.context}) - this += new ChildParentCosineDistance(genderWeight, genderShift, {v:DocEntityVars => v.nerType}) - this += new ChildParentCosineDistance(mentionWeight, mentionShift, {v:DocEntityVars => v.mention}) - this += new ChildParentCosineDistance(numberWeight, numberShift, {v:DocEntityVars => v.number}) - this += new BagOfWordsEntropy(nameEntropy, {v:DocEntityVars => v.names}) -} - -// todo code for model training -// todo train model on tac entity linking -// todo serialize and deserialize models diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/DocEntityVars.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/DocEntityVars.scala deleted file mode 100644 index 2cc5e6f..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/DocEntityVars.scala +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.app.nlp.coref.WithinDocEntity -import cc.factorie.variable.{NoopDiff, BagOfWordsVariable, DenseDoubleBagVariable, DiffList} - -/** - * @author John Sullivan - */ -class DocEntityVars(val names:BagOfWordsVariable, val context:BagOfWordsVariable, val nerType:BagOfWordsVariable, val mention:BagOfWordsVariable, val number:BagOfWordsVariable, val truth:BagOfWordsVariable) extends NodeVariables[DocEntityVars] with Canopy with GroundTruth { - val getVariables = Seq(names, context, nerType, mention, number) - - def canopies = names.value.asHashMap.keySet - - def this() = this(new BagOfWordsVariable(), new BagOfWordsVariable(), new BagOfWordsVariable(), new BagOfWordsVariable(), new BagOfWordsVariable(), new BagOfWordsVariable()) - def this(names:BagOfWordsVariable, context:BagOfWordsVariable, gender:BagOfWordsVariable, mention:BagOfWordsVariable, number:BagOfWordsVariable) = this(names, context, gender, mention, number, new BagOfWordsVariable()) - - def --=(other: DocEntityVars)(implicit d: DiffList) = { - this.names.remove(other.names.value)(d) - this.context.remove(other.context.value)(d) - this.nerType.remove(other.nerType.value)(d) - this.mention.remove(other.mention.value)(d) - this.number.remove(other.number.value)(d) - this.truth.remove(other.truth.value)(d) - if (d ne null) d += NoopDiff(this) // I believe this is necessary because some templates have the EntityVars as its neighbor, but not the bags of words - } - - - def ++=(other: DocEntityVars)(implicit d: DiffList) = { - this.names.add(other.names.value)(d) - this.context.add(other.context.value)(d) - this.nerType.add(other.nerType.value)(d) - this.mention.add(other.mention.value)(d) - this.number.add(other.number.value)(d) - this.truth.add(other.truth.value)(d) - if (d ne null) d += NoopDiff(this) // I believe this is necessary because some templates have the EntityVars as its neighbor, but not the bags of words - } - - def --(other: DocEntityVars)(implicit d: DiffList) = new DocEntityVars(this.names -- other.names, this.context -- other.context, this.nerType -- other.nerType, this.mention -- other.mention, this.number -- other.number, this.truth -- other.truth) - - def ++(other: DocEntityVars)(implicit d: DiffList) = new DocEntityVars(this.names ++ other.names, this.context ++ other.context, this.nerType ++ other.nerType, this.mention ++ other.mention, this.number ++ other.number, this.truth ++ other.truth) -} - - -class DenseDocEntityVars(val names:BagOfWordsVariable, val context:BagOfWordsVariable, val nerType:BagOfWordsVariable, val contextVec:DenseDoubleBagVariable, val number:BagOfWordsVariable, val truth:BagOfWordsVariable) extends NodeVariables[DenseDocEntityVars] with Canopy with GroundTruth { - val getVariables = Seq(names, context, nerType, contextVec, number) - - def canopies = names.value.asHashMap.keySet - - def this() = this(new BagOfWordsVariable(), new BagOfWordsVariable(), new BagOfWordsVariable(), new DenseDoubleBagVariable(50), new BagOfWordsVariable(), new BagOfWordsVariable()) - def this(names:BagOfWordsVariable, context:BagOfWordsVariable, nerType:BagOfWordsVariable, contextVec:DenseDoubleBagVariable, number:BagOfWordsVariable) = this(names, context, nerType, contextVec, number, new BagOfWordsVariable()) - - def --=(other: DenseDocEntityVars)(implicit d: DiffList) = { - this.names.remove(other.names.value)(d) - this.context.remove(other.context.value)(d) - this.nerType.remove(other.nerType.value)(d) - this.contextVec.remove(other.contextVec.value)(d) - 
this.number.remove(other.number.value)(d) - this.truth.remove(other.truth.value)(d) - if (d ne null) d += NoopDiff(this) // I believe this is necessary because some templates have the EntityVars as its neighbor, but not the bags of words - } - - - def ++=(other: DenseDocEntityVars)(implicit d: DiffList) = { - this.names.add(other.names.value)(d) - this.context.add(other.context.value)(d) - this.nerType.add(other.nerType.value)(d) - this.contextVec.add(other.contextVec.value)(d) - this.number.add(other.number.value)(d) - this.truth.add(other.truth.value)(d) - if (d ne null) d += NoopDiff(this) // I believe this is necessary because some templates have the EntityVars as its neighbor, but not the bags of words - } - - def --(other: DenseDocEntityVars)(implicit d: DiffList) = new DenseDocEntityVars(this.names -- other.names, this.context -- other.context, this.nerType -- other.nerType, this.contextVec -- other.contextVec, this.number -- other.number, this.truth -- other.truth) - - def ++(other: DenseDocEntityVars)(implicit d: DiffList) = new DenseDocEntityVars(this.names ++ other.names, this.context ++ other.context, this.nerType ++ other.nerType, this.contextVec ++ other.contextVec, this.number ++ other.number, this.truth ++ other.truth) -} - -object DocEntityVars { - def fromWithinDocEntity(e:WithinDocEntity):DocEntityVars = { - val nameBag = new BagOfWordsVariable() - val contextBag = new BagOfWordsVariable() - val nerBag = new BagOfWordsVariable() - val mentionBag = new BagOfWordsVariable() - val numberBag = new BagOfWordsVariable() - - e.mentions.foreach { mention => - contextBag ++= mention.phrase.contextWindow(5).groupBy(_.lemmaString).mapValues(_.size.toDouble) - //nameBag += mention.phrase.string - //todo filter nominal mentions - nameBag ++= mention.phrase.tokens.map(_.string) - Option(mention.phrase.head.nerTag) match { - case Some(tag) => nerBag += tag.baseCategoryValue - case None => () - } - Option(mention.phrase.number) match { - case Some(number) => numberBag += number.categoryValue - case None => () - } - } - // each other entity in the document - e.document.coref.entities.filterNot(_.uniqueId == e.uniqueId).foreach { entity => - mentionBag += entity.canonicalName - } - new DocEntityVars(nameBag, contextBag, nerBag, mentionBag, numberBag) - } -} - diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/HierarchicalCorefSampler.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/HierarchicalCorefSampler.scala deleted file mode 100644 index 7a80d6a..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/HierarchicalCorefSampler.scala +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
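DocEntityVars above is essentially a record of sparse bag-of-words counts (names, context, NER type, co-occurring mentions, number) that is added to or subtracted from a parent's variables when a node is attached or detached. A small self-contained sketch of that add/remove bookkeeping with a plain map-backed bag; Bag and EntityVars are hypothetical simplifications, not the FACTORIE BagOfWordsVariable.

object BagPropagationSketch {
  // A sparse bag of word counts supporting in-place add/remove of another bag.
  final class Bag(initial: Map[String, Double] = Map.empty) {
    private var counts: Map[String, Double] = initial
    def asMap: Map[String, Double] = counts
    def ++=(other: Bag): Unit =
      counts = (counts.keySet ++ other.asMap.keySet).iterator
        .map(k => k -> (counts.getOrElse(k, 0.0) + other.asMap.getOrElse(k, 0.0)))
        .toMap
    def --=(other: Bag): Unit =
      counts = counts.map { case (k, v) => k -> (v - other.asMap.getOrElse(k, 0.0)) }
        .filter(_._2 > 0.0)
    override def toString: String = counts.toString
  }

  // Grouped variables for one node, added to / removed from a parent as a unit.
  final case class EntityVars(names: Bag, context: Bag) {
    def ++=(other: EntityVars): Unit = { names ++= other.names; context ++= other.context }
    def --=(other: EntityVars): Unit = { names --= other.names; context --= other.context }
  }

  def main(args: Array[String]): Unit = {
    val child  = EntityVars(new Bag(Map("obama" -> 2.0)), new Bag(Map("president" -> 1.0)))
    val parent = EntityVars(new Bag(Map("barack" -> 1.0)), new Bag(Map("election" -> 3.0)))
    parent ++= child // attach: the child's counts flow into the parent
    println(parent)
    parent --= child // detach: the same counts are removed again
    println(parent)
  }
}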
*/ -package cc.factorie.app.nlp.hcoref - -import scala.reflect.ClassTag -import scala.util.Random - -/** - * @author John Sullivan - */ -abstract class HierarchicalCorefSampler[Vars <: NodeVariables[Vars] with Canopy](model :CorefModel[Vars], mentions:Iterable[Node[Vars]], iterations:Int)(implicit random:Random, ct:ClassTag[Vars]) - extends CorefSampler[Vars](model, mentions, iterations)(random, ct) - with DefaultMoveGenerator[Vars] - with CanopyPairGenerator[Vars] \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/LinkingScorer.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/LinkingScorer.scala deleted file mode 100644 index 9b746cb..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/LinkingScorer.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.hcoref - -/** - * @author John Sullivan - */ -object LinkingScorer { - - def scoreString(pred:Map[String,Set[String]], gold:Map[String,Set[String]]):String = { - assert(gold.keySet == pred.keySet) - assert(gold.values.flatten.toSet == pred.values.flatten.toSet) - - val microDenom = gold.values.map(_.size).sum.toDouble - val microNum = gold.map{ case(entId, goldMentions) => - pred(entId).intersect(goldMentions).size - }.sum - val microAccuracy = microNum/microDenom - - val macroDenom = gold.keySet.size - val macroNum = gold.map{ case(entId, goldMentions) => - pred(entId).intersect(goldMentions).size / goldMentions.size.toDouble - }.sum - - val macroAccuracy = macroNum/macroDenom - "P(mentions,entites) %d %d\nT(mentions,entities) %d %d\nmacro accuracy: %.4f\nmicro accuracy: %.4f".format(pred.values.map(_.size).sum, pred.keySet.size, gold.values.map(_.size).sum, gold.keySet.size,macroAccuracy, microAccuracy) - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/Move.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/Move.scala deleted file mode 100644 index c474675..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/Move.scala +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
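LinkingScorer above reports micro accuracy (correctly linked mentions over all mentions) and macro accuracy (per-entity accuracy averaged over entities). A small self-contained worked example with toy data, assuming the same entity-id-to-mention-id-set map shape the scorer expects:

object LinkingAccuracySketch {
  // pred and gold map an entity id to the set of mention ids linked to it.
  def microMacro(pred: Map[String, Set[String]], gold: Map[String, Set[String]]): (Double, Double) = {
    val micro = gold.map { case (ent, goldMentions) => pred(ent).intersect(goldMentions).size }.sum.toDouble /
      gold.values.map(_.size).sum
    val macroAcc = gold.map { case (ent, goldMentions) =>
      pred(ent).intersect(goldMentions).size / goldMentions.size.toDouble
    }.sum / gold.size
    (micro, macroAcc)
  }

  def main(args: Array[String]): Unit = {
    val gold = Map("e1" -> Set("m1", "m2", "m3"), "e2" -> Set("m4"))
    val pred = Map("e1" -> Set("m1", "m2"), "e2" -> Set("m3", "m4"))
    val (micro, macroAcc) = microMacro(pred, gold)
    // 3 of 4 mentions are linked correctly -> micro = 0.75;
    // per-entity accuracies are 2/3 and 1/1 -> macro is about 0.83.
    println(f"micro = $micro%.2f, macro = $macroAcc%.2f")
  }
}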
*/ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.infer.SettingsSampler -import cc.factorie.variable.{DiffList, SettingIterator} - -/** - * User: escher, John Sullivan, akobren - * Date: 10/28/13 - * - */ -trait MoveSettingIterator[Vars <: NodeVariables[Vars]] extends SettingIterator{ - def moves:IndexedSeq[Move[Vars]] - - var i = 0 - - def hasNext = i < moves.size - def next(diff:DiffList) = {val d = newDiffList; moves(i).perform(d); i += 1; d} - def reset = i = 0 -} - - -trait MoveGenerator[Vars <: NodeVariables[Vars]] { - - this: SettingsSampler[(Node[Vars], Node[Vars])] => - - def newInstance(implicit d:DiffList):Node[Vars] -} - -trait Move[Vars <: NodeVariables[Vars]] { - - def name: String - - def perform(d:DiffList):Unit - - def isSymmetric(node1:Node[Vars], node2:Node[Vars]): Boolean // is the move symmetric for this pair of nodes? - - def isValid(node1: Node[Vars], node2:Node[Vars]): Boolean - def operation(node1: Node[Vars], node2:Node[Vars])(d:DiffList): DiffList - final def apply(node1:Node[Vars], node2:Node[Vars])(d:DiffList):DiffList = Option(d) match { - case Some(diff) => operation(node1, node2)(diff) - case None => operation(node1, node2)(new DiffList) - } -} - -class NoMove[Vars <: NodeVariables[Vars]] extends Move[Vars] { - def name = "No Move" - - def perform(d:DiffList) = Unit - - def isSymmetric(node1: Node[Vars], node2: Node[Vars]): Boolean = true - - def isValid(node1: Node[Vars], node2: Node[Vars]): Boolean = true - def operation(node1: Node[Vars], node2: Node[Vars])(d:DiffList) = { - d - } -} - -class MergeLeft[Vars <: NodeVariables[Vars]](val left:Node[Vars], val right:Node[Vars]) extends Move[Vars] { - - def this() = this(null.asInstanceOf[Node[Vars]], null.asInstanceOf[Node[Vars]]) - - def perform(d:DiffList) { - operation(right, left)(d) - } - - def name = "Merge Left" - def isValid(right: Node[Vars], left: Node[Vars]) = right.root != left.root && !left.isMention && left.mentionCountVar.value >= right.mentionCountVar.value - def isSymmetric(node1: Node[Vars], node2: Node[Vars]): Boolean = false - - def operation(right: Node[Vars], left: Node[Vars])(d:DiffList) = { - right.alterParent(Option(left))(d) - d - } -} - -class SplitRight[Vars <: NodeVariables[Vars]](val left:Node[Vars], val right:Node[Vars]) extends Move[Vars] { - - def this() = this(null.asInstanceOf[Node[Vars]], null.asInstanceOf[Node[Vars]]) - - def perform(d:DiffList) { - operation(right, left)(d) - } - - def name = "Split Right" - def isValid(right: Node[Vars], left: Node[Vars]): Boolean = left.root == right.root && right.mentionCountVar.value >= left.mentionCountVar.value - def isSymmetric(node1: Node[Vars], node2: Node[Vars]): Boolean = false - - def operation(right:Node[Vars], left: Node[Vars])(d:DiffList) = { - right.alterParent(None)(d) - d - } -} - -class MergeUp[Vars <: NodeVariables[Vars]](val left:Node[Vars], val right:Node[Vars])(newInstance:(DiffList => Node[Vars])) extends Move[Vars] { - - def this(newInstance:(DiffList => Node[Vars])) = this(null.asInstanceOf[Node[Vars]], null.asInstanceOf[Node[Vars]])(newInstance) - - def perform(d:DiffList) { - operation(right, left)(d) - } - - def name = "Merge Up" - def isValid(right: Node[Vars], left: Node[Vars]): Boolean = left.root != right.root && (left.isRoot && right.isRoot) && (left.isMention && right.isMention) - def isSymmetric(node1: Node[Vars], node2: Node[Vars]): Boolean = true - - def operation(right: Node[Vars], left: Node[Vars])(d:DiffList) = { - val newParent = newInstance(d) - right.alterParent(Some(newParent))(d) - 
left.alterParent(Some(newParent))(d) - d - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/NoSplitMoveGenerator.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/NoSplitMoveGenerator.scala deleted file mode 100644 index 3fe3d2d..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/NoSplitMoveGenerator.scala +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.infer.SettingsSampler - -/** - * @author John Sullivan - */ -trait NoSplitMoveGenerator[Vars <: NodeVariables[Vars]] extends MoveGenerator[Vars]{ - this :SettingsSampler[(Node[Vars], Node[Vars])] => - - def settings(c:(Node[Vars], Node[Vars])) = new MoveSettingIterator[Vars] { - - var (e1, e2) = c - - val moves = new scala.collection.mutable.ArrayBuffer[Move[Vars]]() - - if(e1.root != e2.root) { - if(e1.isMention && e1.isRoot && e2.isMention && e2.isRoot) { - moves += new MergeUp[Vars](e1, e2)({d => newInstance(d)}) - } else if(e1.isMention && e2.isMention) { - if(e1.parent != null) { - moves += new MergeLeft[Vars](e1.parent, e2) - } - if(e2.parent != null) { - moves += new MergeLeft[Vars](e2.parent, e1) - } - } else { - while (e1 != null) { - if(e1.mentionCountVar.value >= e2.mentionCountVar.value && !e1.isMention) { - moves += new MergeLeft[Vars](e1, e2) - } else { - if(e2.isMention) { // we should only be here if e2 has a parent - moves += new MergeLeft[Vars](e2.parent, e1) - } else { - moves += new MergeLeft[Vars](e2, e1) - } - } - e1 = e1.getParent.getOrElse(null.asInstanceOf[Node[Vars]]) - } - } - } - - moves += new NoMove[Vars] - } -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/Node.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/Node.scala deleted file mode 100644 index 2cd8481..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/Node.scala +++ /dev/null @@ -1,328 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.Cubbie -import cc.factorie.app.nlp.coref.{CrossDocEntity, CrossDocMention} -import cc.factorie.util.Hooks1 -import cc.factorie.variable._ - -import scala.annotation.tailrec -import scala.collection.mutable - -/** - * @author harshal, Jack Sullivan - * @date: 10/3/13 - */ - -/** The node class is a generic class representing the nodes of a tree - * designed to be used with the hierarchical coref inference procedure - * - * The idea behind this procedure is that we build up a tree of nodes by - * randomly sampling from the nodes in the tree and proposing to move them - * to another part of the tree (or as a part of a new tree). The decision - * of whether to accept or reject a proposal is defined by the sampler; users - * have the ability to attach arbitrary NodeVariables to a node; these NodeVariables - * represent attributes on the nodes; after a move, the nodes may propogate their - * attributes to their parents after being moved (user specified) - */ -@SerialVersionUID(1l) -class Node[Vars <: NodeVariables[Vars]](val variables:Vars, val uniqueId: String)(implicit d: DiffList) extends CrossDocEntity with Serializable { - - - override def hashCode = Option(uniqueId).map(_.hashCode).getOrElse(super.hashCode) - override def equals(a:Any) = a match { - case other:Node[Vars] if Option(this.uniqueId).isDefined && Option(other.uniqueId).isDefined && this.variables.getClass == other.variables.getClass => other.uniqueId == this.uniqueId - case otw => super.equals(a) - } - - - def mentions = leaves - - type ParentType = Node[Vars] - - def this(variables:Vars)(implicit d:DiffList) = this(variables, java.util.UUID.randomUUID.toString)(d) - - variables.node = this - - class UnrepresentedChildren(initVal:Int=0) extends IntegerVariable(0) { - val node = Node.this - if(initVal > 0) this.set(initVal)(d) - def inc(implicit diff:DiffList) = this.set(this.value + 1)(diff) - def dec(implicit diff:DiffList) = this.set(this.value - 1)(diff) - } - - class MentionCount(initVal:Int=0) extends IntegerVariable(0) { - val node = Node.this - if(initVal > 0) this.set(initVal)(d) - } - class Mentions extends SetVariable[Mention[Vars]]{ - val node = Node.this - } - class Children extends SetVariable[Node[Vars]]{ - val node = Node.this - } - class IsRoot(initVal:Boolean = parentRef.dst == null)(implicit d:DiffList) extends BooleanVariable(false) { - val node = Node.this - if(initVal) this.set(initVal)(d) - } - class IsMention(initVal:Boolean) extends BooleanVariable(initVal){ - val node = Node.this - } - class Exists(initVal:Boolean) extends BooleanVariable(false) { - val node = Node.this - if(initVal) this.set(initVal)(d) - } - - private val parentRef = new ArrowVariable[Node[Vars], Node[Vars]](this, null) - val childrenVar:Children = new Children - val mentionsVar:Mentions = new Mentions - val mentionCountVar:MentionCount = new MentionCount - val isRootVar:IsRoot = new IsRoot - val isMentionVar:IsMention = new IsMention(false) - val existsVar:Exists = new Exists(true) - - - def children = childrenVar.value - def isMention = isMentionVar.booleanValue - def isRoot = isRootVar.booleanValue - def exists = existsVar.booleanValue - - def descendents:Iterable[Node[Vars]] = { - this.children ++ this.children.collect { - case n if n.children.nonEmpty => n.descendents - case n if n.children.isEmpty => Nil - }.flatten - } - - def parent = getParent.getOrElse(null.asInstanceOf[Node[Vars]]) - final def getParent:Option[Node[Vars]] = Option(parentRef.dst) - def 
leaves:Iterable[Mention[Vars]] = mentionsVar.value - - @tailrec - final def root:Node[Vars] = getParent match { - case Some(p) => p.root - case None => this - } - @tailrec - final def ancestor(numGenerationsBack:Int):Node[Vars] = { - if(numGenerationsBack == 0){ - this - } else if(numGenerationsBack > this.depth -1) { - throw new IllegalArgumentException("Cannot go back deeper than depth (%d) received: %d".format(this.depth -1, numGenerationsBack)) - } else { - this.getParent.get.ancestor(numGenerationsBack -1) - } - } - - - final def lineage:Iterable[Node[Vars]] = { - def lineageHelper(pOpt:Option[Node[Vars]], parents:List[Node[Vars]]):List[Node[Vars]] = { - pOpt match { - case Some(p) => lineageHelper(p.getParent, p :: parents) - case None => parents - } - } - lineageHelper(Some(this),Nil) - } - - def size:Int = 1 + children.map(_.size).sum //todo Make Faster! - - final def depth:Int = { - @tailrec - def helper(pOpt:Option[Node[Vars]], cDepth:Int):Int = { - pOpt match { - case Some(p) => helper(p.getParent, cDepth + 1) - case None => cDepth - } - } - helper(getParent,1) - } - - def markForDeletion { - parentRef.set(null)(null) - deleteHooks(this) - } - - val deleteHooks = new Hooks1[Node[Vars]] - - - val loadedFromDb = false - protected val deletionRecord:mutable.HashSet[String] = null - - protected def deleteHook[N <: Node[_]](node: N) { - if(node.loadedFromDb) { - deletionRecord add node.uniqueId - } - } - deleteHooks.append(deleteHook) - - @tailrec - private def propagateAddition(addedVariables:Vars)(implicit d:DiffList):Unit = this.getParent match { - case Some(p) => { - p.variables ++= addedVariables - p.propagateAddition(addedVariables)(d) - } - case None => Unit - } - - @tailrec - private def propagateRemoval(removedVariables:Vars)(implicit d:DiffList):Unit = this.getParent match { - case Some(p) => { - p.variables --= removedVariables - p.propagateRemoval(removedVariables)(d) - } - case None => Unit - } - - final def alterParent(newParent:Option[Node[Vars]])(implicit d :DiffList) { - val oldParent = this.getParent - oldParent match { - case Some(oParent) => { - propagateRemoval(this.variables)(d) - oParent.childrenVar.remove(this)(d) - Node.propagateUpdateMentionCount(-this.mentionCountVar.value, oldParent)(d) - Node.propagateRemoveMentions(this.mentionsVar.value, oldParent)(d) - // TODO AK: With the current moves, a move cannot make an old parent into a root, but if there ever were, we'd need a way to add that change to the DiffList - } - case None => Unit - } - newParent match { - case Some(ment) if ment.isMention => - throw new IllegalStateException("We should never be making a mention a parent, but we tried to make %s %s's parent".format(ment, this)) - case Some(nParent) => { - parentRef.set(nParent)(d) - propagateAddition(this.variables)(d) - if(this.isRoot) { - isRootVar.set(false)(d) - } - nParent.childrenVar.add(this)(d) - Node.propagateUpdateMentionCount(this.mentionCountVar.value, newParent)(d) - Node.propagateAddMentions(this.mentionsVar.value, newParent)(d) - } - case None => { - parentRef.set(null)(d) - isRootVar.set(true)(d) - if(this.childrenVar.value.size <= 1 && !isMention) {// if we no longer have a parent, we aren't a mention, and we have one or fewer children, we no longer exist. 
- existsVar.set(false)(d) - } - } - } - if(oldParent.isDefined && oldParent.get.leaves.size == 0) { // This make the old parent garbage collectible - oldParent.get.alterParent(None)(d) - } - if(oldParent.isDefined && oldParent.get.children.size == 1) { - oldParent.get.children.head.alterParent(oldParent.get.getParent)(d) - } - } - - override def toString = s"Node($uniqueId, $variables)" -} - -object Node { - @tailrec - protected def propagateRemoveMentions[Vars <: NodeVariables[Vars]](mentionVar: Node[Vars]#Mentions#Value, parentRef:Option[Node[Vars]])(implicit d:DiffList):Unit = parentRef match { - case Some(p) => { - p.mentionsVar.removeAll(mentionVar)(d) - propagateRemoveMentions(mentionVar,p.getParent)(d) - } - case None => Unit - } - - @tailrec - protected def propagateAddMentions[Vars <: NodeVariables[Vars]](mentionVar: Node[Vars]#Mentions#Value, parentRef:Option[Node[Vars]])(implicit d:DiffList):Unit = parentRef match { - case Some(p) => { - p.mentionsVar.addAll(mentionVar)(d) - propagateAddMentions(mentionVar,p.getParent)(d) - } - case None => Unit - } - @tailrec - protected def propagateUpdateMentionCount[Vars <: NodeVariables[Vars]](update: Int, parentRef:Option[Node[Vars]])(implicit d:DiffList):Unit = parentRef match { - case Some(p) => { - p.mentionCountVar.set(p.mentionCountVar.value + update)(d) - propagateUpdateMentionCount(update,p.getParent)(d) - } - case None => Unit - } -} - - -@SerialVersionUID(1l) -class Mention[Vars <: NodeVariables[Vars]](v:Vars, id: String, var withinDocEntityId:String = null)(implicit d:DiffList) extends Node[Vars](v, id)(d) with CrossDocMention { - - def entity = root - def string = this.toString - - def mark:Unit = () - - def this(variables:Vars)(implicit d:DiffList) = this(variables, java.util.UUID.randomUUID.toString)(d) - override final val size = 1 - override final val leaves = List(this) - override final val isMentionVar = new IsMention(true) - override final val mentionCountVar = new MentionCount(1) - mentionsVar += this - - override def toString = s"Mention($uniqueId, $variables)" -} - -@SerialVersionUID(1l) -trait NodeVariables[Self <: NodeVariables[Self]] extends SelfVariable[Self] with Serializable { - - this: Self => - - var node:Node[Self] = null - - def ++(other:Self)(implicit d:DiffList):Self - def --(other:Self)(implicit d:DiffList):Self - def ++=(other:Self)(implicit d:DiffList):Unit - def --=(other:Self)(implicit d:DiffList):Unit - - def getVariables: Seq[Var] - def size:Int = getVariables.size - override def toString:String = "%s(%s)".format(this.getClass.getSimpleName, getVariables.map(_.toString).mkString(", ")) -} - -trait NodeCubbie[Vars <: NodeVariables[Vars]] extends Cubbie { - - type N = Node[Vars] - val deletionSet:mutable.HashSet[String] - - val parentRef = RefSlot("parentRef", () => newNodeCubbie) - val isMention = BooleanSlot("isMention") - val canopies = StringListSlot("canopies") - val source = StringSlot("src") - - def newNode(v: Vars, id:String) = new Node(v,id)(null) { - override val loadedFromDb = true - override val deletionRecord = deletionSet - } - def newMention(v: Vars, id:String) = new Mention(v,id)(null) { - override val loadedFromDb = true - override val deletionRecord = deletionSet - } - - def newNodeCubbie : NodeCubbie[Vars] - - def fetch(v: Vars) = if(isMention.value) newMention(v, this.id.toString) else newNode(v, this.id.toString) - - def store(node: N) = { - - node.getParent match{ - case Some(n) => parentRef := n - case None => - } - isMention := node.isMention - this - } -} \ No newline at end of 
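The core invariant of Node.alterParent above is that moving a subtree keeps every ancestor's aggregate state consistent: mention counts and bags are subtracted along the old ancestor chain and added along the new one. A compact self-contained sketch of that invariant with a toy node type (TNode is a hypothetical stand-in, not the Node class being deleted here, and it tracks only a mention count):

object NodePropagationSketch {
  // Toy hierarchy node: a mention contributes 1 to the mention count of every ancestor.
  final class TNode(val isMention: Boolean) {
    var parent: Option[TNode] = None
    var mentionCount: Int = if (isMention) 1 else 0

    def root: TNode = parent.map(_.root).getOrElse(this)

    private def propagate(delta: Int, from: Option[TNode]): Unit =
      from.foreach { p => p.mentionCount += delta; propagate(delta, p.parent) }

    // Re-parent this subtree, keeping ancestor counts consistent on both chains.
    def alterParent(newParent: Option[TNode]): Unit = {
      propagate(-mentionCount, parent) // remove our contribution from the old ancestors
      parent = newParent
      propagate(+mentionCount, parent) // add it to the new ancestors
    }
  }

  def main(args: Array[String]): Unit = {
    val m1 = new TNode(isMention = true)
    val m2 = new TNode(isMention = true)
    val m3 = new TNode(isMention = true)
    val entityA = new TNode(isMention = false)
    val entityB = new TNode(isMention = false)
    Seq(m1, m2).foreach(_.alterParent(Some(entityA)))
    m3.alterParent(Some(entityB))
    println(s"A=${entityA.mentionCount}, B=${entityB.mentionCount}") // A=2, B=1
    m2.alterParent(Some(entityB))                                    // move one mention across
    println(s"A=${entityA.mentionCount}, B=${entityB.mentionCount}") // A=1, B=2
    println(m2.root == entityB)                                      // true
  }
}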
file diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/NodeCollection.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/NodeCollection.scala deleted file mode 100644 index 49bc345..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/NodeCollection.scala +++ /dev/null @@ -1,200 +0,0 @@ -///* Copyright (C) 2008-2016 University of Massachusetts Amherst. -// This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) -// http://factorie.cs.umass.edu, http://github.com/factorie -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. */ -//package cc.factorie.app.nlp.hcoref -// -////import cc.factorie.db.mongo.MongoCubbieImplicits._ -////import cc.factorie.db.mongo.{LazyCubbieConverter, MongoCubbieCollection, MutableCubbieCollection} -//import cc.factorie.util.{ArrayDoubleSeq, Cubbie} -//import cc.factorie.variable.{BagOfWordsVariable, DenseDoubleBagVariable, Var} -//import com.mongodb.DB -// -//import scala.collection.mutable -//import scala.reflect.ClassTag -// -///** -// * User: escher -// * Date: 11/2/13 -// */ -//trait NodeCollection[Vars <: NodeVariables[Vars]] { -// type N = Node[Vars] -// def +=(n :N) :Unit -// def ++=(es:Iterable[N]) :Unit -// def drop() :Unit -// def store(nodesToStore:Iterable[N]):Unit -// def nextBatch(n:Int=10) :Seq[N] -// def loadAll :Seq[N] -//} -// -//class BOWCubbie extends Cubbie{ -// val nodeId = RefSlot[NodeCubbie[_]]("nid",()=>null.asInstanceOf[NodeCubbie[_]]) //todo understand how this null works -// val word = StringSlot("w") -// val count = DoubleSlot("c") -// def fetch = this.word -> this.count -// def store(id:String, w:String, c:Double) = { -// nodeId := id -// word := w -// count := c -// this -// } -//} -// -//class DenseArrayCubbie extends Cubbie { -// val nodeId = RefSlot[NodeCubbie[_]]("nid",()=>null.asInstanceOf[NodeCubbie[_]]) //todo understand how this null works -// val arr = DoubleSeqSlot("a") -// def fetch = this.arr -// def store(id:String, arr:Array[Double]) = { -// nodeId := id -// this.arr := new ArrayDoubleSeq(arr) -// this -// } -//} -// -//trait DBNodeCollection[Vars <: NodeVariables[Vars], NC <: NodeCubbie[Vars]] extends NodeCollection[Vars]{ -// protected val _id2cubbie = mutable.HashMap[String,NC]() -// protected def newNodeCubbie :NC -// protected def newNode(v:Vars, nc:NC) = if(nc.isMention.value) { -// nc.newMention(v, nc.id.toString) -// } else { -// nc.newNode(v, nc.id.toString) -// } -// protected def cubbify(n:N) = {val nc = newNodeCubbie; nc.store(n); nc} -// protected def nodeCubbieColl :MutableCubbieCollection[NC] -// def += (n:N){ insert(n) } -// def ++=(ns:Iterable[N]){ insert(ns) } -// def insert(c:NC) { nodeCubbieColl += c } -// def insert(n:N) { nodeCubbieColl += cubbify(n) } -// def insert(ns:Iterable[N]) { nodeCubbieColl ++= ns.map(cubbify) } -// def drop:Unit -// def store(nodesToStore:Iterable[N]) { -// val (created, others) = nodesToStore.partition(n => n.exists && !n.loadedFromDb) -// nodeCubbieColl ++= created.map(cubbify) -// for(node <- others){ -// if(!node.exists){ -// 
nodeCubbieColl.remove(_.idIs(node.uniqueId)) -// } -// else { -// nodeCubbieColl.updateDelta(_id2cubbie(node.uniqueId),cubbify(node)) -// } -// } -// } -// -// def assembleNodes(toAssemble:Seq[N], node2ParentId: Map[N, String], id2Node:Map[String, N]) { -// -// def assembleHelper(n:N) { -// if(!n.getParent.isDefined && node2ParentId.isDefinedAt(n)) { -// val parent = id2Node(node2ParentId(n)) -// n.alterParent(Some(parent))(null) -// assembleHelper(parent) -// } -// } -// -// val mentions = toAssemble.filter(_.isMention) -// for(m <- mentions){ -// assembleHelper(m) -// } -// } -// -//} -// -//abstract class MongoNodeCollection[Vars <: NodeVariables[Vars], NC <: NodeCubbie[Vars]](val bagNames:Seq[String], val arrayNames:Seq[String], mongoDB:DB)(implicit ct: ClassTag[Vars]) extends DBNodeCollection[Vars, NC]{ -// val numBags = ct.runtimeClass.getDeclaredFields.count(_.getType.getName.endsWith("BagOfWordsVariable")) -1 -// //assert(bagNames.size == numBags+1, "Insufficient bag of words collection names : "+numBags+1+"<"+bagNames.size) -// val numArrays = ct.runtimeClass.getDeclaredFields.count(_.getType.getName.endsWith("DenseDoubleBagVarable")) -// //assert(arrayNames.size == numArrays, "Insufficient dense collection names : "+numArrays+"<"+bagNames.size) -// protected val colls = bagNames.map(mongoDB.getCollection) -// val nodeCubbieColl = new MongoCubbieCollection[NC](colls(0),() => newNodeCubbie,(a:NC) => Seq(Seq(a.parentRef))) with LazyCubbieConverter[NC] -// val varsCubbieColls = colls.tail.map(coll => new MongoCubbieCollection(coll,() => newBOWCubbie,(a:BOWCubbie) => Seq(Seq(a.nodeId))) with LazyCubbieConverter[BOWCubbie]) -// val denseCubbieColls = arrayNames.map{ arrName => -// new MongoCubbieCollection(mongoDB.getCollection(arrName),() => newDenseCubbie, (a:DenseArrayCubbie) => Seq(Seq(a.nodeId))) with LazyCubbieConverter[DenseArrayCubbie] -// } -// -// def drop: Unit = ??? -// -// def nextBatch(n: Int): Seq[N] = ??? 
-// -// def getTruth(nc:NC):String -// -// override def += (n:N){ -// var bowIdx = 1 -// var arrIdx = 0 -// for(v <- n.variables.getVariables){ -// v match { -// case bow:BagOfWordsVariable => -// varsCubbieColls(bowIdx) ++= cubbifyBOW(n.uniqueId, bow) -// bowIdx+=1 -// case arr:DenseDoubleBagVariable => -// denseCubbieColls(arrIdx) += new DenseArrayCubbie().store(n.uniqueId, arr.value) -// arrIdx += 1 -// case _ => println("can't cubbify this type") -// } -// } -// insert(n) -// } -// -// def cubbifyBOW(nodeId:String, bow:BagOfWordsVariable) = bow.value.asHashMap.map{ -// case (w,d) => newBOWCubbie.store(nodeId, w, d) -// } -// -// def loadAll: Seq[N] = { -// val node2ParentId = mutable.HashMap[N, String]() -// val id2Node = mutable.HashMap[String, N]() -// val nodes = -// for(nc <- nodeCubbieColl.toSeq) yield { -// _id2cubbie += nc.id.toString -> nc -// val bowVars = varsCubbieColls.map(coll => { -// val it = coll.findByAttribute("bid",Seq(nc.id)) -// val bag = new BagOfWordsVariable -// for(b <- it){ -// bag += (b.word.value, b.count.value) -// } -// bag -// }) -// val arrVars = denseCubbieColls.flatMap{ coll => //todo check that this is correct -// coll.findByAttribute("bid", Seq(nc.id)).map{cub => -// val arr = cub.arr.value.asArray -// val denseVar = new DenseDoubleBagVariable(arr.length) -// denseVar.set(arr)(null) -// denseVar -// } -// } -// val v = newNodeVars(getTruth(nc), (bowVars ++ arrVars):_*) -// val n = newNode(v,nc) -// id2Node += nc.id.toString -> n -// if(nc.parentRef.isDefined){ -// node2ParentId += n -> nc.parentRef.value.toString -// } -// n -// } -// assembleNodes(nodes, node2ParentId.toMap, id2Node.toMap) -// nodes -// } -// -// // def loadByIds(ids: Seq[String]): Seq[N] = { -// // for(nc <- nodeCubbieColl.findByIds(ids)) yield { -// // val vars = mutable.ArrayBuffer[Var]() -// // for(coll <- varsCubbieColls){ -// // val it = coll.findByAttribute("nid",Seq(nc.id)) -// // //todo create a var with the cubbies -// // } -// // val v = newNodeVars(vars:_*) -// // val n = newNode(v,n) -// // n -// // } -// // } -// -// protected def newBOWCubbie = new BOWCubbie() -// protected def newDenseCubbie = new DenseArrayCubbie() -// -// protected def newNodeVars[V <: Var](truth:String, vars: V*) : Vars -//} -// diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/NodeTemplates.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/NodeTemplates.scala deleted file mode 100644 index 4baa64f..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/NodeTemplates.scala +++ /dev/null @@ -1,327 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.hcoref - -import cc.factorie.{Parameters, _} -import cc.factorie.app.strings -import cc.factorie.la.{Tensor, Tensor1} -import cc.factorie.model._ -import cc.factorie.util.VectorUtils -import cc.factorie.variable._ - -import scala.reflect.ClassTag - -/** - * @author John Sullivan - */ -class EntitySizePrior[Vars <: NodeVariables[Vars]](val weight:Double=0.1, val exponent:Double=1.2, val saturation:Double=128) - extends TupleTemplateWithStatistics3[Node[Vars]#Exists,Node[Vars]#IsRoot,Node[Vars]#MentionCount]{ - - def unroll1(exists: Node[Vars]#Exists) = Factor(exists, exists.node.isRootVar, exists.node.mentionCountVar) - def unroll2(isRoot: Node[Vars]#IsRoot) = Factor(isRoot.node.existsVar, isRoot, isRoot.node.mentionCountVar) - def unroll3(mentionCount: Node[Vars]#MentionCount) = Factor(mentionCount.node.existsVar, mentionCount.node.isRootVar, mentionCount) - - def score(exists: Node[Vars]#Exists#Value, isRoot: Node[Vars]#IsRoot#Value, mentionCount: Node[Vars]#MentionCount#Value) = if(exists.booleanValue && isRoot.booleanValue) { - math.min(saturation, math.pow(mentionCount, exponent)) * weight - } else { - 0.0 - } -} - -class SingleBagTemplate[Vars <: NodeVariables[Vars]](initialWeight:Double, getBag:(Vars => BagOfWordsVariable), getScore:(BagOfWordsVariable => Double), val name:String)(implicit ct:ClassTag[Vars], params:Parameters) -extends Template2[Node[Vars]#Exists, Vars] -with DotFamily2[Node[Vars]#Exists, Vars] -with DebuggableTemplate { - - def unroll1(v: Node[Vars]#Exists) = Factor(v, v.node.variables) - def unroll2(v: Vars) = Factor(v.node.existsVar, v) - - override def statistics(exists: Node[Vars]#Exists#Value, vars:Vars) = if(exists.booleanValue) { - val score = getScore(getBag(vars)) - report(score, t(0)) - Tensor1(score) - } else { - report(0.0, t(0)) - Tensor1(0.0) - } - - private val t = Tensor1(initialWeight) - val _weights = params.Weights(t) - - def weights: Weights = _weights - - -} - -class BagOfWordsEntropy[Vars <: NodeVariables[Vars]](initialWeight:Double, getBag:(Vars => BagOfWordsVariable), bagName:String = "")(implicit ct:ClassTag[Vars], params:Parameters) - extends SingleBagTemplate[Vars](initialWeight, getBag, {b => - val bag = b.value - var entropy = 0.0 - var n = 0.0 - val l1Norm = bag.l1Norm - - bag.asHashMap.foreach{ case(k,v) => - val vNormAbs = math.abs(v/l1Norm) - entropy -= (vNormAbs)*math.log(vNormAbs) - n+=1.0 - } - if(n>1)entropy /= scala.math.log(n) //normalized entropy in [0,1] - if((-entropy).isNaN) 0.0 else -entropy - -entropy - }, "BagOfWordsEntropy: %s".format(bagName)) - -class RootNodeBagTemplate[Vars <: NodeVariables[Vars]](initialWeight:Double, getBag:(Vars => BagOfWordsVariable), getScore:(BagOfWordsVariable => Double), val name:String)(implicit ct:ClassTag[Vars], params:Parameters) - extends Template3[Node[Vars]#Exists,Node[Vars]#IsRoot,Vars] - with DotFamily3[Node[Vars]#Exists,Node[Vars]#IsRoot,Vars] - with DebuggableTemplate { - - def unroll1(exists: Node[Vars]#Exists) = Factor(exists, exists.node.isRootVar, exists.node.variables) - def unroll2(isRoot: Node[Vars]#IsRoot) = Factor(isRoot.node.existsVar, isRoot, isRoot.node.variables) - def unroll3(vars: Vars) = Factor(vars.node.existsVar, vars.node.isRootVar, vars) - - - override def statistics(exists: Node[Vars]#Exists#Value, isRoot: Node[Vars]#IsRoot#Value, vars: Vars) = if(exists.booleanValue && isRoot.booleanValue) { - val score = getScore(getBag(vars)) - report(score, t(0)) - Tensor1(score) - } else { - report(0.0, t(0)) - Tensor1(0.0) - } - - private 
val t = Tensor1(initialWeight) - val _weights = params.Weights(t) - - def weights: Weights = _weights - -} - -class BagOfWordsSizePrior[Vars <: NodeVariables[Vars]](initialWeight:Double, getBag:(Vars => BagOfWordsVariable), bagName:String = "")(implicit ct:ClassTag[Vars], params:Parameters) - extends RootNodeBagTemplate[Vars](initialWeight, getBag, {bag => if (bag.size > 0) - bag.size.toDouble / bag.value.l1Norm else 0.0}, "BagOfWordsSizePrior: %s".format(bagName)) - -class EmptyBagPenalty[Vars <: NodeVariables[Vars]](initialWeight:Double, getBag:(Vars => BagOfWordsVariable), bagName:String = "")(implicit ct:ClassTag[Vars], params:Parameters) - extends RootNodeBagTemplate[Vars](initialWeight, getBag, {bag => if (bag.size == 0) -1.0 else 0.0}, "EmptyBagPenalty: %s".format(bagName)) - -class EntityNameTemplate[Vars <: NodeVariables[Vars]](val firstLetterWeight:Double=4.0, val fullNameWeight:Double=4.0,val weight:Double=64,val saturation:Double=128.0, val penaltyOnNoName:Double=2.0, getBag:(Vars => BagOfWordsVariable), bagName:String = "")(implicit ct:ClassTag[Vars], params:Parameters) - extends TupleTemplateWithStatistics3[Node[Vars]#Exists,Node[Vars]#IsRoot,Vars] - with DebuggableTemplate { - - val name = "EntityNameTemplate: %s".format(bagName) - - def unroll1(exists: Node[Vars]#Exists) = Factor(exists, exists.node.isRootVar, exists.node.variables) - def unroll2(isRoot: Node[Vars]#IsRoot) = Factor(isRoot.node.existsVar, isRoot, isRoot.node.variables) - def unroll3(vars: Vars) = Factor(vars.node.existsVar, vars.node.isRootVar, vars) - - - override def score(exists: Node[Vars]#Exists#Value, isRoot: Node[Vars]#IsRoot#Value, vars: Vars) = { - var score = 0.0 - var firstLetterMismatches = 0 - var nameMismatches = 0 - val bag = getBag(vars) - bag.value.asHashMap.keySet.pairs.foreach { case(tokI, tokJ) => - if(tokI.charAt(0) != tokJ.charAt(0)) { - firstLetterMismatches += 1 - } - if(tokI.length > 1 && tokJ.length > 1) { - nameMismatches += tokI editDistance tokJ - } - } - score -= math.min(saturation, firstLetterMismatches * firstLetterWeight) - score -= math.min(saturation, nameMismatches * fullNameWeight) - if(bag.size == 0 && isRoot.booleanValue) { - score -= penaltyOnNoName - } - report(score, weight) - score * weight - } -} - -abstract class ChildParentTemplate[Vars <: NodeVariables[Vars]](val initWeights:Tensor1)(implicit v1:ClassTag[Vars], params:Parameters) - extends Template3[ArrowVariable[Node[Vars], Node[Vars]], Vars, Vars] - with DotFamily3[ArrowVariable[Node[Vars], Node[Vars]], Vars, Vars]{ - override def unroll1(v: ArrowVariable[Node[Vars], Node[Vars]]) = Option(v.dst) match { // If the parent-child relationship exists, we generate factors for it - case Some(dest) => Factor(v, v.src.variables, dest.variables) - case None => Nil - } - def unroll2(v: Vars) = Nil - def unroll3(v: Vars) = Nil - - val _weights = params.Weights(initWeights) - - def weights: Weights = _weights -} - -class ChildParentCosineDistance[Vars <: NodeVariables[Vars]](weight:Double, shift: Double, getBag:(Vars => BagOfWordsVariable), bagName:String = "")(implicit c:ClassTag[Vars], p:Parameters) extends ChildParentTemplate[Vars](Tensor1(weight)) with DebuggableTemplate { - val name: String = "ChildParentCosineDistance: %s".format(bagName) - - override def statistics(v1: (Node[Vars], Node[Vars]), child: Vars, parent: Vars): Tensor = { - val childBag = getBag(child) - val parentBag = getBag(parent) - val v = childBag.value.cosineSimilarity(parentBag.value, childBag.value) + shift - - report(v, initWeights(0)) - 
Tensor1(v) - } -} - -class ChildParentPersonNameTemplate[Vars <: NodeVariables[Vars]](weight:Double, shift:Double, getName:(Vars => PersonNameVariable), scoreNames:((PersonName, PersonName) => Double), varName:String = "")(implicit c:ClassTag[Vars], p:Parameters) extends ChildParentTemplate[Vars](Tensor1(weight)) with DebuggableTemplate { - def name = "ChildParentPersonName: %s".format(varName) - - override def statistics(v1: (Node[Vars], Node[Vars]), child: Vars, parent: Vars) = { - val childName = getName(child) - val parentName = getName(parent) - val v = scoreNames(childName.value, parentName.--(childName)(null).value) // at this point the child bag has been added to the parent and needs to be remove for regular comparison - report(v, initWeights(0)) - Tensor1(v) - } -} - -/** - * This feature serves to ensure that certain merges are forbidden. Specifically no two nodes that share the same value - * in the [[cc.factorie.variable.BagOfWordsVariable]] should be permitted to merge. Together with [[IdentityFactor]] it can create uniquely - * identifying features. - */ -class ExclusiveConstraintFactor[Vars <: NodeVariables[Vars]](getBag:(Vars => BagOfWordsVariable), bagName:String = "")(implicit ct:ClassTag[Vars]) - extends TupleTemplateWithStatistics3[ArrowVariable[Node[Vars], Node[Vars]], Vars, Vars] - with DebuggableTemplate { - val name = "ExclusiveConstraintFactor: %s".format(bagName) - - override def unroll1(v: ArrowVariable[Node[Vars], Node[Vars]]) = Option(v.dst) match { // If the parent-child relationship exists, we generate factors for it - case Some(dest) => Factor(v, v.src.variables, dest.variables) - case None => Nil - } - def unroll2(v: Vars) = Nil - def unroll3(v: Vars) = Nil - - def score(v1: (Node[Vars], Node[Vars]), child: Vars, parent: Vars) = { - val childBag = getBag(child) - val parentBag = getBag(parent) - var result = 0.0 - if((childBag.value.asHashMap.keySet & parentBag.--(childBag)(null).value.asHashMap.keySet).nonEmpty) { - result = -999999.0 - } else { - result = 0.0 - } - report(result, 1.0) - result - } -} - -class MatchConstraint[Vars <: NodeVariables[Vars]](matchScore:Double, matchPenalty:Double, getBag:(Vars => BagOfWordsVariable), bagName:String = "")(implicit ct:ClassTag[Vars], p:Parameters) - extends ChildParentTemplate[Vars](Tensor1(matchScore, matchPenalty)) - with DebuggableTemplate { - def name = "Matching Constraint on: %s".format(bagName) - - override def statistics(v1: ArrowVariable[Node[Vars], Node[Vars]]#Value, child: Vars#Value, parent: Vars#Value) = { - val x = getBag(child) - val y = (getBag(parent) -- x)(null) - if(x.value.contains(y.value)) { - Tensor1(1,0) - } else { - Tensor1(0,1) - } - } -} - -/** - * This feature serves to account for special information that may uniquely identify an entity. If a merge is proposed - * between two nodes that share a value in getBag they will be merged. This feature does not ensure that the value in - * getBag is unique, [[ExclusiveConstraintFactor]] manages that separately. 
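Several of the templates above score a proposed attachment by comparing a child's bag of words against its parent's bag with the child's own contribution removed, typically cosine similarity plus a shift. A self-contained sketch of that comparison on plain count maps; the real templates operate on BagOfWordsVariable values and learned weights, so this is illustrative only.

object ChildParentCosineSketch {
  type Bag = Map[String, Double]

  def dot(a: Bag, b: Bag): Double =
    a.iterator.map { case (k, v) => v * b.getOrElse(k, 0.0) }.sum

  def norm(a: Bag): Double = math.sqrt(dot(a, a))

  def cosine(a: Bag, b: Bag): Double = {
    val d = norm(a) * norm(b)
    if (d == 0.0) 0.0 else dot(a, b) / d
  }

  // Score the child against the parent *minus* the child, then shift, so a child
  // that only matches its own contribution inside the parent gets no credit.
  def childParentScore(child: Bag, parent: Bag, shift: Double): Double = {
    val parentMinusChild = parent.map { case (k, v) => k -> (v - child.getOrElse(k, 0.0)) }
      .filter(_._2 > 0.0)
    cosine(child, parentMinusChild) + shift
  }

  def main(args: Array[String]): Unit = {
    val child  = Map("obama" -> 2.0, "president" -> 1.0)
    val parent = Map("obama" -> 3.0, "president" -> 1.0, "barack" -> 1.0)
    println(f"score = ${childParentScore(child, parent, shift = -0.25)}%.3f")
  }
}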
- */ -class IdentityFactor[Vars <: NodeVariables[Vars]](getBag:(Vars => BagOfWordsVariable), bagName:String = "")(implicit ct:ClassTag[Vars]) - extends TupleTemplateWithStatistics3[ArrowVariable[Node[Vars], Node[Vars]], Vars, Vars] - with DebuggableTemplate { - val name = "IdentityFactor: %s".format(bagName) - - override def unroll1(v: ArrowVariable[Node[Vars], Node[Vars]]) = Option(v.dst) match { // If the parent-child relationship exists, we generate factors for it - case Some(dest) => Factor(v, v.src.variables, dest.variables) - case None => Nil - } - def unroll2(v: Vars) = Nil - def unroll3(v: Vars) = Nil - - def score(v1: (Node[Vars], Node[Vars]), child: Vars, parent: Vars) = { - val childBag = getBag(child) - val parentBag = getBag(parent) - var result = 0.0 - if(childBag.value.asHashMap.exists{case (id, _) => parentBag.value.asHashMap.contains(id)}) { - result = 9999999.0 - } else { - result = 0.0 - } - report(result, 1.0) - result - } -} - - -class ChildParentDistanceFactor[Vars <: NodeVariables[Vars]](weight:Double, shift:Double, getBag:(Vars => BagOfWordsVariable), distance:((BagOfWordsVariable, BagOfWordsVariable) => Double), metricName:String = "", elementName:String = "")(implicit ct:ClassTag[Vars], p:Parameters) - extends ChildParentTemplate[Vars](Tensor1(weight)) - with DebuggableTemplate { - - val name = "ChildParentDistance: %s-%s".format(metricName, elementName) - - override def statistics(v1: (Node[Vars], Node[Vars]), child: Vars, parent: Vars) = { - val childBag = getBag(child) - val parentBag = getBag(parent).--(childBag)(null) - //println("child: %s parent: %s distance: %.4f".format(childBag, parentBag, distance(childBag, parentBag))) - Tensor1(distance(childBag, parentBag) + shift) - } -} - -class ChildParentStringDistance[Vars <: NodeVariables[Vars]](weight:Double, shift:Double, getBag:(Vars => BagOfWordsVariable), elementName:String="")(implicit ct:ClassTag[Vars], p:Parameters) extends ChildParentDistanceFactor[Vars](weight, shift, getBag, {(x:BagOfWordsVariable, y:BagOfWordsVariable) => 1 - ( strings.editDistance(x.value.longest,y.value.longest) / math.max(x.value.longest.length, y.value.longest.length))}, "string edit distance", elementName) - -class DenseCosineDistance[Vars <: NodeVariables[Vars]](weight:Double, shift:Double, getArray:(Vars => DenseDoubleBagVariable), elementName:String="")(implicit ct:ClassTag[Vars], params:Parameters) extends ChildParentTemplate[Vars](Tensor1(weight)) with DebuggableTemplate { - - val name = "DenseCosineDistance: %s".format(elementName) - - import VectorUtils._ - override def statistics(v1: ArrowVariable[Node[Vars], Node[Vars]]#Value, child: Vars, parent: Vars) = { - val childArray = getArray(child).value - val parentArray = getArray(parent).value - val v = (childArray cosineSimilarityWithParent parentArray)+shift - if (v.toString == "Infinity"){ - println("got infinite cosine distance from:\n%s\n%s".format(childArray.map(x => "%.4f".format(x)).mkString(","), parentArray.map(x => "%.4f".format(x)).mkString(","))) - } - report(v, initWeights(0)) - Tensor1(v) - } -} - -class DenseBagOfWordsEntropy[Vars <: NodeVariables[Vars]](initialWeight:Double, getArray:(Vars => DenseDoubleBagVariable), elementName:String="")(implicit ct:ClassTag[Vars], params:Parameters) - extends Template2[Node[Vars]#Exists, Vars] - with DotFamily2[Node[Vars]#Exists, Vars] - with DebuggableTemplate { - - val name = "DenseBagOfWordsEntropy: %s".format(elementName) - - def unroll1(v: Node[Vars]#Exists) = Factor(v, v.node.variables) - def unroll2(v: Vars) = 
Factor(v.node.existsVar, v) - - import VectorUtils._ - - override def statistics(exists: Node[Vars]#Exists#Value, vars:Vars) = if(exists.booleanValue) { - val score = getArray(vars).value.normalizedEntropyForLogValues - report(score, t(0)) - Tensor1(score) - } else { - report(0.0, t(0)) - Tensor1(0.0) - } - - private val t = Tensor1(initialWeight) - val _weights = params.Weights(t) - def weights = _weights - -} - diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/PairGenerator.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/PairGenerator.scala deleted file mode 100644 index 5d30af5..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/PairGenerator.scala +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.hcoref - -/** - * @author John Sullivan - */ -trait PairGenerator[Vars <: NodeVariables[Vars]] { - def nextContext:(Node[Vars], Node[Vars]) - def iterations:Int - def mentions:Iterable[Node[Vars]] - - def contexts:Iterable[(Node[Vars], Node[Vars])] = new Iterator[(Node[Vars], Node[Vars])] { - - var index = 0 - - def hasNext: Boolean = index < iterations - - def next(): (Node[Vars], Node[Vars]) = if(hasNext) { - index += 1 - nextContext - } else { - throw new NoSuchElementException("Max iterations exceeded %d" format iterations) - } - }.toStream -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/PostSampler.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/PostSampler.scala deleted file mode 100644 index 21468ca..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/PostSampler.scala +++ /dev/null @@ -1,234 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
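The PairGenerator trait removed in this hunk exposes repeated calls to nextContext as a bounded sequence of node pairs. A minimal sketch of the same pattern over an arbitrary element type is below; it uses a plain Iterator where the original wrapped the iterator in a memoizing Stream, and the trait name is a placeholder rather than FACTORIE API.

trait BoundedContexts[A] {
  def nextContext: (A, A)   // produces one candidate pair, possibly with side effects
  def iterations: Int       // hard cap on how many pairs will ever be drawn

  // Lazily evaluated, bounded view over repeated calls to nextContext.
  def contexts: Iterator[(A, A)] = new Iterator[(A, A)] {
    private var index = 0
    def hasNext: Boolean = index < iterations
    def next(): (A, A) = {
      if (!hasNext) throw new NoSuchElementException(s"Max iterations exceeded $iterations")
      index += 1
      nextContext
    }
  }
}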
*/ -package cc.factorie.app.nlp.hcoref - -import cc.factorie._ - -import scala.collection.mutable -import scala.util.Random - -/** - * @author johnsullivan - * - * Direct Scoring Model simply exposes the cosine distance scores of a coref model as a distance function on - * nodes to aid [[cc.factorie.app.nlp.hcoref.PostSampler]] - */ -trait DirectScoringModel[Vars <: NodeVariables[Vars]] extends CorefModel[Vars] { - def scoreDistance(a:Node[Vars], b:Node[Vars]) = { - templates.collect{ case dist:ChildParentCosineDistance[Vars] => - dist.score(a -> b, a.variables, b.variables) - }.sum - } -} - -/** - * @author johnsullivan - * - * PostSampler provides a suite of methods to change the structure of the trees to work better as - * hierarchical summaries rather than coreference hypotheses - */ -trait PostSampler[Vars <: NodeVariables[Vars], Model <: DirectScoringModel[Vars]] { - this: CorefSampler[Vars] with MoveGenerator[Vars] with Logger => - - implicit val random:Random - - def scoreDist(n1:Node[Vars], n2:Node[Vars]) = model.asInstanceOf[Model].scoreDistance(n1, n2) - - def postSample: Unit = { - var branches = mentions.flatMap(_.getParent).toSet.toSeq //mentions.collect {case m if !m.isMention && m.getParent.isDefined => m.parent}.toSet.toSeq - val orphans = mentions.filter(m => m.isMention && m.getParent.isEmpty) - log("About to slot %d orphans into %d branches".format(orphans.size, branches.size)) - val threshold = (branches.size * orphans.size) / 10 //never look at more than 10% of the possible size - - var idx = 0 - val oIter = orphans.iterator - while(oIter.hasNext) { - var curIdx = 0 - var maxScore = Double.MinValue - var maxIdx = -1 - var maxl1 = Double.MinValue - var maxl2 = Double.MinValue - val candidates = mutable.ArrayBuffer[Node[Vars]]() - val orphan = oIter.next() - val bIter = branches.iterator - while(bIter.hasNext && (!(maxScore > 0.0 && maxl1 > 0.0 && maxl2 > 0.0) || !(candidates.length > threshold))) { - val branch = bIter.next() - val score = scoreDist(orphan, branch) - candidates += branch - - if(score > maxScore) { - maxIdx = curIdx - maxl2 = maxl1 - maxl1 = maxScore - maxScore = score - } else if(score > maxl1) { - maxl2 = maxl1 - maxl1 = score - } else if(score > maxl2) { - maxl2 = score - } - curIdx += 1 - } - - idx += 1 - if(candidates.nonEmpty) { - orphan.alterParent(Some(candidates(maxIdx)))(null) - } - branches = branches.shuffle - } - if(mentions.count(m => m.isMention && m.getParent.isEmpty) > 0) { - log("At this point we should have no more mentions but we have " + mentions.count(m => m.isMention && m.getParent.isEmpty)) - } - log("done") - - } - - - - def retryMentions: Unit = { - var orphans = mentions.filter(m => m.isMention && m.getParent.isEmpty).toSeq - log("trying merger on %d mentions".format(orphans.size)) - val minScore = -1.5 - var scoreThresh = 0.0 - var remainingMents = orphans.size - var remainingM1 = -1 - while (orphans.nonEmpty && !((scoreThresh == minScore) && remainingM1 == remainingMents)) { - val threshold = orphans.size / 10 - val candidates = - for(i <- orphans.indices; - j <- i + 1 until threshold; - n1 = orphans(i); - n2 = orphans(j); - score = scoreDist(n1, n2) - if score > scoreThresh) yield { - (n1, n2, score) - } - candidates.sortBy(-_._3).headOption match { - case Some((n1, n2, _)) => - new MergeUp[Vars](n1,n2)({d:DiffList => newInstance(d)}).perform(null) - case None => - scoreThresh = math.max(minScore, scoreThresh - 0.1) - } - - remainingM1 = remainingMents - orphans = mentions.filter(m => m.isMention && 
m.getParent.isEmpty).toSeq - remainingMents = orphans.size - } - log("done trying to merge mentions with %d mentions left and a score threshold of %.2f".format(orphans.size, scoreThresh)) - } - - def getScoreMatrix(ns:Seq[Node[Vars]], threshold:Int = 10):Seq[(Node[Vars], Node[Vars], Double)] = - (for(i <- ns.indices; - j <- i + 1 until math.min(ns.size, threshold); - n1 = ns(i); - n2 = ns(j); - s = scoreDist(n1, n2)) yield {(n1, n2, s)}).sortBy(_._3) - - def dropInRoots: Unit = { - val roots = mentions.map(_.root).filterNot(_.isMention).toSeq - log("dropping in %d roots".format(roots.size)) - val scoreMat = getScoreMatrix(roots, 50) - val mergedRoots = mutable.HashSet[Node[Vars]]() - scoreMat.takeRight(50).foreach { case (n1, n2, _) => - if((!mergedRoots.contains(n1) || !mergedRoots.contains(n2)) && !(n1 == n2)) { - if(n1.children.size > n2.children.size && !mergedRoots.contains(n1)) { - mergedRoots += n2 - new MergeLeft[Vars](n1, n2).perform(null) - } else { - mergedRoots += n1 - new MergeLeft[Vars](n2, n1).perform(null) - } - } - } - log("\nDone dropping roots, now we have %d roots".format(mentions.map(_.root).toSet.size)) - } - - def internalReshuffle: Unit = { - val threshold = mentions.size / 10 - val roots = mentions.collect {case m if !m.root.isMention => m.root}.toSet.toSeq - log("Started reshuffle") - def processNode(node:Node[Vars]): Unit = { - implicit val diff:DiffList = null - if(node.children.nonEmpty && node.children.size >= threshold) { - val children = node.children.toSeq - val sub1 = newInstance(null) - val sub2 = newInstance(null) - sub1 alterParent Some(node) - sub2 alterParent Some(node) - - val scores = getScoreMatrix(children).map{case(a,b,c) => (a,b) -> c} - val ((n1, n2), _) = scores.head - n1 alterParent Some(sub1) - n2 alterParent Some(sub2) - val relScoreMap = mutable.HashMap[Node[Vars], (Double, Double)]().withDefault(_ => (Double.NaN, Double.NaN)) - for(((c1, c2), sc) <- scores.tail - if (c1 == n1 || c1 == n2 || c2 == n1 || c2 == n2) && !(n1==c1 && n2==c2)) { // if we have a pair that has one of n1/n2, but not both - if(c1 == n1) { - val (_, s2) = relScoreMap(c2) - relScoreMap(c2) = sc -> s2 - } else if(c1 == n2) { - val (s1, _) = relScoreMap(c2) - relScoreMap(c2) = s1 -> sc - } else if(c2 == n1) { - val (_, s2) = relScoreMap(c1) - relScoreMap(c1) = sc -> s2 - } else if(c2 == n2) { - val (s1, _) = relScoreMap(c1) - relScoreMap(c1) = s1 -> sc - } else { - throw new IllegalStateException("c1: %s c2: %s n1: %s n2: %s sc:%.2f".format(c1, c2, n1, n2, sc)) - } - } - relScoreMap.foreach { case(n, (s1, s2)) => - assert(!s1.isNaN) - assert(!s2.isNaN) - n alterParent Some(if (s1 > s2) sub1 else sub2) - } - - log("\tafter reshuffle we have two nodes of size %d and %d".format(sub1.children.size, sub2.children.size)) - sub1.children foreach processNode - sub2.children foreach processNode - } - } - roots foreach processNode - log("finished reshuffle") - } - - def semiBIRCH: Unit = { - val roots = mentions.roots.sortBy(- _.mentions.size) - val numMents = roots.foldLeft(0)(_ + _.mentions.size) - val cutoff = 0.7 - - var mentCnt = 0 - val (bigRoots, smallRoots) = roots.foldLeft((Seq.empty[Node[Vars]], Seq.empty[Node[Vars]])) { case ((bigNodes, rest), r) => - if(mentCnt < (numMents * cutoff)) { - mentCnt += r.mentions.size - (bigNodes :+ r) -> rest - } else { - bigNodes -> (rest :+ r) - } - } - def dropNode(candidates:Seq[Node[Vars]], node:Node[Vars]): Unit = { - val (s, newParent) = (candidates.map(c => scoreDist(c, node) -> c) ++ node.getParent.map(p => scoreDist(p, node) -> 
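The post-sampling passes deleted here (postSample, retryMentions, dropInRoots) all rely on the same greedy step: attach each unplaced node to whichever candidate parent maximizes a pairwise model score. A stripped-down sketch of that step follows, with a generic score function standing in for the model's distance templates; the names are illustrative and not part of the deleted API.

object GreedyAttach {
  // For each orphan, pick the candidate parent with the highest score.
  def attachOrphans[N](orphans: Seq[N], branches: Seq[N], score: (N, N) => Double): Map[N, N] = {
    require(branches.nonEmpty, "need at least one candidate parent")
    orphans.map(o => o -> branches.maxBy(b => score(o, b))).toMap
  }
}

The deleted code additionally caps how many branches it inspects per orphan and reshuffles the branch order between orphans; the sketch omits those heuristics.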
p)).sortBy(-_._1).head - log ("giving %s to %s as a child with score %.4f".format(node, newParent, s)) - if(newParent != node.parent) { - node.alterParent(Some(newParent))(null) - dropNode(newParent.children.filterNot(c => c.isMention || c == node).toSeq, node) - } else { - log("%s settled down with %s as a parent".format(node, node.getParent)) - } - } - smallRoots.foreach(dropNode(bigRoots, _)) - } - -} diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/TACCoref.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/TACCoref.scala deleted file mode 100644 index 8b0828d..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/TACCoref.scala +++ /dev/null @@ -1,529 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* -package cc.factorie.app.nlp.hcoref - -import java.io._ -import java.util.zip.GZIPInputStream - -import cc.factorie._ -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.coref.ParseForwardCoref -import cc.factorie.app.nlp.ner.NoEmbeddingsConllStackedChainNer -import cc.factorie.app.nlp.parse.OntonotesTransitionBasedParser -import cc.factorie.app.nlp.phrase.Phrase -import cc.factorie.app.nlp.pos.OntonotesForwardPosTagger -import cc.factorie.app.nlp.segment.{DeterministicNormalizingTokenizer, DeterministicSentenceSegmenter} -import cc.factorie.util.{NonValidatingXML, VectorUtils} -import cc.factorie.variable.{BagOfWordsVariable, CategoricalDomain, DenseDoubleBagVariable} - -import scala.collection.mutable.{ArrayBuffer, HashMap} -import scala.io.Source -import scala.util.Random -*/ -/** - * @author John Sullivan - */ -/* -object TACCorefWithFactorie { - def main(args:Array[String]) { - val tacRoot = args(0) - val evalPath = args(1) - - val map = new Tac2009FlatDocumentMap(tacRoot) - - val refMentions = ProcessQueries.loadQueries(evalPath + ".xml", evalPath + ".tab") - - println("loaded %d mentions/queries in %d entities.".format(refMentions.size, refMentions.map(_.entId).toSet.size)) - - val pipelineElements = Seq( - DeterministicNormalizingTokenizer, - DeterministicSentenceSegmenter, - OntonotesForwardPosTagger, - NoEmbeddingsConllStackedChainNer, - OntonotesTransitionBasedParser, - ParseForwardCoref - ) - - val pipeline = DocumentAnnotatorPipeline(DocumentAnnotatorPipeline.defaultDocumentAnnotationMap.toMap, Nil, pipelineElements.flatMap(_.postAttrs)) - - println("Processing ref mentions and documents: ") - refMentions.par.foreach{ rMention => - val doc = new Document(map.getDoc(rMention.docId).toIterator.mkString("\n")).setName(rMention.docId) - rMention.doc = Some(doc) - rMention.getTokenSpan.map(ts => doc.getCoref.addMention(new Phrase(ts))) // we add our gold mentions before coref and processing - pipeline.process(doc) - print(".") - } - - val converter = new RefMentionConverter(pipeline) - - val mentions = refMentions.flatMap(converter.toDocEntNode).toSeq - println("Found %d mentions in 
documents out of %d total mention (%.4f \\%)".format(mentions.size, refMentions.size, mentions.size.toDouble/refMentions.size)) - - val splitPoint = (mentions.size * 0.75).toInt - val (train, test) = mentions.splitAt(splitPoint) - - println("Split into %d training and %d testing".format(train.size, test.size)) - implicit val rand = new Random() - - val tacCoref = new DocEntityCoref {implicit val random: Random = rand - - def estimateIterations(mentionCount: Int) = mentionCount * 100 - - val model = new DocEntityCorefModel(4.0, 0.25, 1.0, 2.0, 0.25, 1.0, 0.25, 3.0, 0.25, 1.0, 0.25) - - val autoStopThreshold = 10000 - } - - val sampler = tacCoref.getSampler(test) - sampler.infer - } -} - -object TACCoref { - - //val tagger = new OntonotesForwardPosTagger() - - def main(args:Array[String]) { - val tacRoot = args(0) - val evalPath = args(1) - val embeddingFile = args(2) - - val embeddings = EmbeddingSpace.fromFile(embeddingFile) - - val map = new Tac2009FlatDocumentMap(tacRoot) - - val refMentions = ProcessQueries.loadQueries(evalPath + ".xml", evalPath + ".tab") - - val mentions = refMentions.flatMap{ rMention => - val doc = new Document(map.getDoc(rMention.docId).toIterator.mkString("\n")).setName(rMention.docId) - DeterministicNormalizingTokenizer.process(doc) - DeterministicSentenceSegmenter.process(doc) - rMention.doc = Some(doc) - - val tokenSpanOpt = doc.getSectionByOffsets(rMention.getOffsets._1, rMention.getOffsets._2).getOrElse(doc.asSection).offsetSnapToTokens(rMention.getOffsets._1, rMention.getOffsets._2) - if(tokenSpanOpt.isEmpty) { - println("for doc %s didn't find token span from name %s and offsets: %s".format(rMention.docId, rMention.name, rMention.getOffsets)) - } - tokenSpanOpt.map{ tokenSpan => - - val nameBag = new BagOfWordsVariable() - val contextBag = new BagOfWordsVariable() - val nerBag = new BagOfWordsVariable() - val mentionBag = new BagOfWordsVariable() - val numberBag = new BagOfWordsVariable() - val truth = new BagOfWordsVariable() - val contextVec = new DenseDoubleBagVariable(50) - - - nameBag ++= tokenSpan.tokens.map(_.string) - contextBag ++= tokenSpan.contextWindow(10).groupBy(_.string).mapValues(_.size.toDouble) - contextVec.set(embeddings.embedPhrase(contextBag.value.asHashMap.keySet.toSeq))(null) - nerBag += rMention.entType - truth += rMention.entId - - new Mention[DenseDocEntityVars](new DenseDocEntityVars(nameBag, contextBag, nerBag, contextVec, numberBag, truth), rMention.id)(null) - } - } - println("done finding token spans and building mentions") - - val splitPoint = (mentions.size * 0.75).toInt - val (train, test) = mentions.splitAt(splitPoint) - - println("Split into %d training and %d testing".format(train.size, test.size)) - implicit val rand = new Random() - - class DocEntityModel(namesWeights:Double, namesShift:Double, nameEntropy:Double, contextsWeight:Double, contextsShift:Double, matchScore:Double, matchPenalty:Double, denseContextWeight:Double, denseContextShift:Double) extends CorefModel[DenseDocEntityVars] { - this += new ChildParentCosineDistance(namesWeights, namesShift, {v:DenseDocEntityVars => v.names}) - this += new ChildParentCosineDistance(contextsWeight, contextsShift, {v:DenseDocEntityVars => v.context}) - this += new MatchConstraint(matchScore, matchPenalty, {v:DenseDocEntityVars => v.nerType}) - this += new DenseCosineDistance(denseContextWeight, denseContextShift, {v:DenseDocEntityVars => v.contextVec}) - this += new BagOfWordsEntropy(nameEntropy, {v:DenseDocEntityVars => v.names}) - } - - - val model = new 
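Several of the factors and variables referenced above (DenseCosineDistance, the contextVec built from embedPhrase) score dense context vectors by cosine similarity. As a reminder of the underlying computation, here is a self-contained version over plain arrays; the deleted code instead goes through VectorUtils and DenseDoubleBagVariable, so this object is illustrative only.

object Cosine {
  // Cosine similarity of two equal-length vectors; 0.0 when either has zero norm.
  def similarity(a: Array[Double], b: Array[Double]): Double = {
    require(a.length == b.length, "vectors must have the same dimensionality")
    var dot = 0.0; var na = 0.0; var nb = 0.0
    var i = 0
    while (i < a.length) { dot += a(i) * b(i); na += a(i) * a(i); nb += b(i) * b(i); i += 1 }
    if (na == 0.0 || nb == 0.0) 0.0 else dot / (math.sqrt(na) * math.sqrt(nb))
  }
}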
DocEntityModel(1.0, -0.25, 0.5, 1.0, -0.25, 1.0, -10.0, 1.0, -0.25) - - val trainer = new CorefSampler[DenseDocEntityVars](model, train, train.size * 100) - with AutoStoppingSampler[DenseDocEntityVars] - with CanopyPairGenerator[DenseDocEntityVars] - with NoSplitMoveGenerator[DenseDocEntityVars] - with DebugCoref[DenseDocEntityVars] - with TrainingObjective[DenseDocEntityVars] - with PrintlnLogger { - def newInstance(implicit d: DiffList) = new Node[DenseDocEntityVars](new DenseDocEntityVars()) - - val autoStopThreshold = 10000 - } - trainer.train(100000) - - println(trainer.model.parameters.tensors) - - val sampler = new CorefSampler[DenseDocEntityVars](model, test, test.size * 100) - with AutoStoppingSampler[DenseDocEntityVars] - with CanopyPairGenerator[DenseDocEntityVars] - with NoSplitMoveGenerator[DenseDocEntityVars] - with DebugCoref[DenseDocEntityVars] - with TrainingObjective[DenseDocEntityVars] - with PrintlnLogger { - def newInstance(implicit d: DiffList) = new Node[DenseDocEntityVars](new DenseDocEntityVars()) - - val autoStopThreshold = 10000 - } - - sampler.infer - - //println(EvaluatableClustering.evaluationString(test.predictedClustering, test.trueClustering)) - val goldMap = test.map { mention => - mention.variables.truth.value.asHashMap.keySet.head -> mention.uniqueId - }.groupBy(_._1).mapValues(_.map(_._2).toSet) - - val predMap = test.map{m:Node[DenseDocEntityVars] => m.root}.toSet.map { entities:Node[DenseDocEntityVars] => - entities.variables.truth.value.topWord -> entities.mentions.map(_.uniqueId).toSet - }.toMap - //println(LinkingScorer.scoreString(predMap, goldMap)) - } -} - -/** - * Takes a docId and returns the raw text of the corresponding document - */ -trait DocumentMap { - def getDoc(docId:String):BufferedReader -} - -class Tac2009FlatDocumentMap(tacRoot:String) extends DocumentMap { - def getDoc(docId:String):BufferedReader = { - val filePath = s"$tacRoot/$docId.sgm" - new BufferedReader(new FileReader(filePath)) - } -} - -object ProcessQueries { - - - def loadQueries(queryXMLFile:String, queryTabFile:String):Iterable[ReferenceMention] = { - val entMap = Source.fromFile(queryTabFile).getLines().map { line => - val Array(mentId, entId, entType) = line.split("\\s+") - mentId -> (entId, entType) - }.toMap - - NonValidatingXML.loadFile(queryXMLFile).\\("kbpentlink").\\("query").map { qXML => - val id = (qXML \ "@id").text.trim - val name = (qXML \ "name").text.trim - val docName = (qXML \ "docid").text.trim - val beg = qXML \ "beg" - val end = qXML \ "end" - assert(beg.isEmpty == end.isEmpty) - val offsets:Option[(Int, Int)] = if (beg.isEmpty || end.isEmpty) None else Some(beg.text.toInt, end.text.toInt) - ReferenceMention(id, name, docName, offsets, entMap(id)._1, entMap(id)._2) - } - } -} - -case class ReferenceMention(id:String, name:String, docId:String, offsets:Option[(Int, Int)], entId:String, entType:String) { - var doc:Option[Document] = None - def getOffsets:(Int, Int) = offsets.getOrElse { - val start = doc.get.string.replaceAll("""-\n""","-").replaceAll("""\n"""," ").indexOfSlice(name) - val end = start + name.length - 1 - start -> end - } - def getTokenSpan = doc.get.getSectionByOffsets(this.getOffsets._1, this.getOffsets._2).getOrElse(doc.get.asSection).offsetSnapToTokens(this.getOffsets._1, this.getOffsets._2) -} - -object RefMentionConverterNoPipeline { - def toDocEntNode(ref:ReferenceMention):Option[Mention[DocEntityVars]] = { - val doc = ref.doc.get - DeterministicNormalizingTokenizer.process(doc) - DeterministicSentenceSegmenter.process(doc) - 
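ProcessQueries.loadQueries, shown above, joins each query id from the evaluation XML with an entity id and entity type read from a whitespace-separated tab file. A minimal sketch of just the tab-file side is below, assuming the one-triple-per-line layout described in the deleted code; the object name is illustrative.

import scala.io.Source

object QueryTab {
  // Reads lines of the form "mentionId entityId entityType" into a lookup map.
  def load(path: String): Map[String, (String, String)] = {
    val src = Source.fromFile(path)
    try {
      src.getLines().filter(_.trim.nonEmpty).map { line =>
        val Array(mentId, entId, entType) = line.trim.split("\\s+")
        mentId -> (entId, entType)
      }.toMap
    } finally src.close()
  }
}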
- val offsetOpt = ref.offsets match { - case None => - ref.name.r.findFirstMatchIn(doc.string).map(m => m.start -> m.end) - case otw => otw - } - offsetOpt.flatMap{ case (s, e) => - doc.getSectionByOffsets(s, e).flatMap(_.offsetSnapToTokens(s, e)) match { - case Some(refSpan) => - implicit val d:DiffList = null - val xMent = new Mention[DocEntityVars](new DocEntityVars()) - xMent.variables.names ++= refSpan.map{t:Token => t.lemmaString}.toCountBag - xMent.variables.context ++= refSpan.contextWindow(10).map(_.lemmaString).toCountBag - - Option(doc.coref).flatMap{_.findOverlapping(refSpan)} match { - case Some(ment) => - xMent.variables.++=(DocEntityVars.fromWithinDocEntity(ment.entity))(null) - xMent.withinDocEntityId = ment.entity.uniqueId - case None => println("Could not find coref or align mention: " + ref) - } - Some(xMent) - case None => - println("WARNING: Failed to find tokens for reference mention: " + ref) - None - } - } - } -} - -class RefMentionConverter(val pipeline:DocumentAnnotationPipeline) { - - def toDocEntNode(ref:ReferenceMention):Option[Mention[DocEntityVars]] = { - val doc = pipeline.process(ref.doc.get) - - val offsetOpt = ref.offsets match { - case None => - ref.name.r.findFirstMatchIn(doc.string).map(m => m.start -> m.end) - case otw => otw - } - offsetOpt.flatMap{ case (s, e) => - doc.getSectionByOffsets(s, e).flatMap(_.offsetSnapToTokens(s, e)) match { - case Some(refSpan) => - implicit val d:DiffList = null - val xMent = new Mention[DocEntityVars](new DocEntityVars(), ref.id) - xMent.variables.names ++= refSpan.map{t:Token => t.lemmaString}.toCountBag - xMent.variables.context ++= refSpan.contextWindow(10).map(_.lemmaString).toCountBag - xMent.variables.truth += ref.entId - - Option(doc.coref).flatMap{_.findOverlapping(refSpan)} match { - case Some(ment) => - xMent.variables.++=(DocEntityVars.fromWithinDocEntity(ment.entity))(null) - xMent.withinDocEntityId = ment.entity.uniqueId - case None => println("Could not find coref or align mention: " + ref) - } - Some(xMent) - case None => - println("WARNING: Failed to find tokens for reference mention: " + ref) - None - } - } - } -} - -object GenerateEmbeddings { - def main(args:Array[String]) { - val tacRoot = args(0) - val evalPath = args(1) - val embeddingFilename = args(2) - - val map = new Tac2009FlatDocumentMap(tacRoot) - - val refMentions = ProcessQueries.loadQueries(evalPath + ".xml", evalPath + ".tab") - - val tokens = refMentions.map{ rMention => - val doc = new Document(map.getDoc(rMention.docId).toIterator.mkString("\n")).setName(rMention.docId) - DeterministicNormalizingTokenizer.process(doc) - DeterministicSentenceSegmenter.process(doc) - doc.tokens.map(_.lemmaString) - } - - println("loaded and tokenized, starting embeddings") - - val dimensions = 50 - val iterations = 10 - val regularizer = 10 - val learningRate = 0.1 - - - val random = new scala.util.Random(0) - val domain = new CategoricalDomain[String]() - val space = new EmbeddingSpace(domain,dimensions,random) - println("embeddings initialized") - space.learnEmbeddingsFromText(tokens,iterations,regularizer,learningRate) - - println("writing embeddings") - Embeddings.writeEmbedding(new File(embeddingFilename), space) - //testEmbeddings(space,test) - } -} - -object EmbeddingSpace{ - import VectorUtils._ - def fromFile(fileName:String):EmbeddingSpace ={ - val reader = if(fileName.endsWith(".gz") || fileName.endsWith("tgz")) new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(new File(fileName))))) - else new 
BufferedReader(new InputStreamReader(new FileInputStream(new File(fileName)))) - var result:EmbeddingSpace=null - val map = new HashMap[String,Array[Double]] - var line: String = "" - //val tmpResult = new ArrayBuffer[Pair[String,Array[Double]]] - while({line = reader.readLine(); line != null}){ - val pair = line.split("[\t]") - assert(pair.length == 2, "{%s} is %d in length" format(line, pair.length)) - val weights = pair(1).split(" ").map(e => e.toDouble) - if (result==null)result = new EmbeddingSpace(new CategoricalDomain[String],weights.length,new scala.util.Random(0)) - result.setEmbedding(pair(0),weights) - } - result - } - def stopWordStats(space:EmbeddingSpace,stop:Seq[String],control:Seq[String]){ - val mean = zero(space.dimensionality) - var meanNorm = 0.0 - //val variance = zero(space.dimensionality) - space.wordTypes.foreach(mean += _) - space.wordTypes.foreach(meanNorm += _.twoNorm) - mean /= space.wordTypes.size.toDouble - meanNorm /= space.wordTypes.size.toDouble - //space.wordTypes.foreach(x => variance += x.twoDistance(mean)) - println("Mean: "+mean.mkString(",")) - println("||Mean||: "+mean.twoNorm) - println("Average ||Mean||: "+meanNorm) - val wordsAndLabels = stop.map(_ -> "stop") ++ control.map(_ -> "ctrl") - val numStops = wordsAndLabels.filter(_._2=="stop").size - val numControl = wordsAndLabels.size-numStops - var stopFromMean=0.0 - var controlFromMean=0.0 - println("Words: ") - for((word,label) <- wordsAndLabels){ - val x = space.getOrElseZero(word) - val norm = x.twoNorm - val toMean = (x-mean).twoNorm - val h = x.normalizedEntropyForLogValues - if (label=="stop")stopFromMean+=toMean else controlFromMean+=toMean - //if (label=="stop")stopFromMean+=h else controlFromMean+=h - println(" "+label+" "+h+" "+toMean+" "+word+" "+norm) - } - stopFromMean /= numStops - controlFromMean /= numControl - val boundary = (stopFromMean + controlFromMean)/2 - println("Stop from mean: "+stopFromMean) - println("Control from mean: "+controlFromMean) - var numCorrect=0 - var total=0 - for((word,label) <- wordsAndLabels){ - val x = space.getOrElseZero(word) - val toMean = (x-mean).twoNorm - val predictStop = toMean < boundary - val isStop = label=="stop" - if((predictStop && isStop) || (!predictStop && !isStop))numCorrect += 1 - total+=1 - } - println("Accuracy: "+numCorrect.toDouble/total.toDouble) - } -} -class EmbeddingSpace(val domain:CategoricalDomain[String],val dimensionality:Int,val random:scala.util.Random){ - import VectorUtils._ - val wordTypes = new ArrayBuffer[Array[Double]] - def mean = {val r = zero(dimensionality);var i=0;while(inew EmbeddingExample(ws.toIndexedSeq,this)).toIndexedSeq,iterations,regularizer,learningRate) - } - def learnEmbeddings(examples:IndexedSeq[EmbeddingExample],iterations:Int,regularizer:Double,learningRate:Double){ - assert(examples.forall(_.space eq this)) - assert(examples.forall(_.words.length>1)) - println("Learning embeddings.") - for (i <- 1 to iterations){ - println("Iteration "+i) - var j=0 - for (example <- random.shuffle(examples)){ - gradientStep(example,examples(random.nextInt(examples.size)),regularizer,learningRate*2.0/(math.sqrt(1.0+i.toDouble))) - j+=1 - } - monitorDoc(examples.head) - println("Num updates: "+numUpdates+" out of "+numSteps+" opportunities.") - } - } - def monitorDoc(example:EmbeddingExample){ - println(" Monitoring example") - for(w <- example.words){ - val v = getOrElseZero(w) - println(" -w: "+w+" v: "+v.twoNorm()) - } - } - var numUpdates=0 - var numSteps=0 - def 
gradientStep(example:EmbeddingExample,counterExample:EmbeddingExample,regularizer:Double,learningRate:Double){ - val margin = regularizer/10.0 - var i=0 - val totalSum = example.computeSum() - assert(!totalSum.hasNaN) - while(iregularizer)wordv/=(norm/regularizer) - numUpdates += 1 - } - numSteps += 1 - i+=1 - } - } - def newEmbedding(s:String) = randomArray(dimensionality,random)/dimensionality -} -class EmbeddingExample(val words:IndexedSeq[String],val space:EmbeddingSpace){ - import VectorUtils._ - val wordVectors = words.map(space(_)) - def computeSum():Array[Double]={val contextSum=zero(space.dimensionality);wordVectors.foreach(contextSum += _);contextSum} -} - -object Embeddings{ - //val test = Seq("vldb","emnlp","icml","nips","icvpr","acl","relation extraction","database","knowledge base","entity","coreference","graphical model","approach","face","physics","machine learning","cryptography","graphics","networks","learning","amccallum","elearnedmiller","amoore","speytonjones","ablum","tmitchell","dkarger") - - def writeEmbedding(file:File,space:EmbeddingSpace){ - val out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file))) - for(word <- space.domain.categories){ - val vec = space.getOrElseZero(word) - out.write(word+"\t"+vec.mkString(" ")+"\n") - out.flush - } - out.flush - out.close - } -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/TrainingObjective.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/TrainingObjective.scala deleted file mode 100644 index 09fd2df..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/TrainingObjective.scala +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
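The embedding I/O removed with TACCoref.scala uses a very simple on-disk format: one word per line, a tab, then the space-separated vector components, which is what Embeddings.writeEmbedding emits and EmbeddingSpace.fromFile parses. A self-contained reader and writer for that format is sketched below; the object and method names are illustrative.

import java.io.{File, PrintWriter}
import scala.io.Source

object EmbeddingIO {
  // Writes "word<TAB>v1 v2 ... vn" per line.
  def write(file: File, vectors: Map[String, Array[Double]]): Unit = {
    val out = new PrintWriter(file)
    try vectors.foreach { case (word, vec) => out.println(word + "\t" + vec.mkString(" ")) }
    finally out.close()
  }

  // Reads the same format back into a word-to-vector map.
  def read(file: File): Map[String, Array[Double]] = {
    val src = Source.fromFile(file)
    try {
      src.getLines().map { line =>
        val Array(word, weights) = line.split("\t")
        word -> weights.split(" ").map(_.toDouble)
      }.toMap
    } finally src.close()
  }
}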
*/ -package cc.factorie.app.nlp.hcoref - -import cc.factorie._ -import cc.factorie.model.TupleTemplateWithStatistics2 -import cc.factorie.optimize.{MIRA, ParameterAveraging, SampleRankTrainer} -import cc.factorie.variable.{BagOfWordsVariable, BooleanValue} - -import scala.reflect.ClassTag - -/** - * @author John Sullivan - */ -trait TrainingObjective[Vars <: NodeVariables[Vars] with GroundTruth] { - this:CorefSampler[Vars] with PairGenerator[Vars] => - - override def objective = new CorefTrainerModel[Vars] - - val averager = new MIRA with ParameterAveraging - val trainer = new SampleRankTrainer(this, averager) - - def train(numSteps:Int) { - println("Starting %d training iterations".format(numSteps)) - (0 until numSteps).foreach { idx => - trainer.processContext(nextContext) - } - println("training complete") - averager.setWeightsToAverage(model.asInstanceOf[Parameters].parameters) - } -} - -trait GroundTruth { - this: NodeVariables[_] => - def truth:BagOfWordsVariable -} - -class PairwiseTrainerFeature[Vars <: NodeVariables[Vars] with GroundTruth](val precisionDominated:Double = 0.95)(implicit ct:ClassTag[Vars]) extends TupleTemplateWithStatistics2[Vars, Node[Vars]#Exists] { - def unroll1(vars:Vars) = if(vars.node.isRoot) Factor(vars, vars.node.existsVar) else Nil - def unroll2(isEntity:Node[Vars]#Exists) = if(isEntity.node.isRoot) Factor(isEntity.node.variables, isEntity) else Nil - override def score(vars:Vars, isEntity:BooleanValue):Double ={ - var result = 0.0 - //val bag = s._1 - val bagSeq = vars.truth.iterator.toSeq - var i=0;var j=0 - var tp = 0.0 - var fp = 0.0 - while(i - v.srcId = node2.uniqueId.toString - v.srcDepth = node2.depth - 1 - v.srcBagSize = getBagSize(node2) - v.srcIsEnt = node2.isRoot - v.srcIsMent = node2.isMention - v.srcMentionCount = node2.mentionCountVar.value - - v.dstId = node1.uniqueId.toString - v.dstDepth = node1.depth - 1 - v.dstBagSize = getBagSize(node1) - v.dstIsEnt = node1.isRoot - v.dstIsMent = node1.isMention - v.dstMentionCount = node1.mentionCountVar.value - - - v.moveType = this.name - - case _ => println("Difflist not Verbosity: %s (in move %s)".format(d, this.name)) - } - val res = super.operation(node1, node2)(d) - d match { - case v:Verbosity => - node1.getParent match { - case Some(p) => v.newParentId = p.uniqueId - case None => Unit - } - case _ => Unit - } - res - } -} - -trait VerboseMoveGenerator[Vars <: NodeVariables[Vars]] extends MoveGenerator[Vars] { - this :SettingsSampler[(Node[Vars], Node[Vars])] => - - def outerGetBagSize(n:Node[Vars]):Int - - def settings(c:(Node[Vars], Node[Vars])) = new VerboseSettingIterator with MoveSettingIterator[Vars] { - var (e1, e2) = c - - val moves = new scala.collection.mutable.ArrayBuffer[Move[Vars]]() - - if(e1.root != e2.root) { - if(e1.isMention && e1.isRoot && e2.isMention && e2.isRoot) { - moves += new MergeUp[Vars](e1, e2)({d => newInstance(d)}) - } else if(e1.isMention && e2.isMention) { - if(e1.parent != null) { - moves += new MergeLeft[Vars](e1.parent, e2) - } - if(e2.parent != null) { - moves += new MergeLeft[Vars](e2.parent, e1) - } - } else { - while (e1 != null) { - if(e1.mentionCountVar.value >= e2.mentionCountVar.value && !e1.isMention) { - moves += new MergeLeft[Vars](e1, e2) - } else { - if(e2.isMention) { // we should only be here if e2 has a parent - moves += new MergeLeft[Vars](e2.parent, e1) - } else { - moves += new MergeLeft[Vars](e2, e1) - } - } - e1 = e1.getParent.getOrElse(null.asInstanceOf[Node[Vars]]) - } - } - } else { - if(e1.mentionCountVar.value > 
e2.mentionCountVar.value) { - moves += new SplitRight[Vars](e2, e1) with VerboseMove[Vars] {def getBagSize(n:Node[Vars]) = outerGetBagSize(n)} - } else { - moves += new SplitRight[Vars](e1, e2) with VerboseMove[Vars] {def getBagSize(n:Node[Vars]) = outerGetBagSize(n)} - } - } - - moves += new NoMove[Vars] with VerboseMove[Vars] {def getBagSize(n:Node[Vars]) = outerGetBagSize(n)} - } -} - -trait VerboseSampler[C] { - this: SettingsSampler[C] => - - private val _verbosities = new mutable.ArrayBuffer[Verbosity]() - def verbosities:Iterable[Verbosity] = _verbosities - private var numSamples = 0 - - override def proposals(context:C): Seq[Proposal[C]] = { - val result = new mutable.ArrayBuffer[Proposal[C]] - // the call to 'next' is actually what causes the change in state to happen - var i = 0 - val si = settings(context) - while (si.hasNext) { - val d = si.next() - assert(model ne null) // TODO!!! Clean up and delete this - val (m,o) = d.scoreAndUndo(model, objective) - d match { - case v:Verbosity => { - v.deltaScore = m - v.samplingStep = numSamples - } - case _ => println("Difflist not Verbosity: %s (In Proposal)".format(d)) - } - //if (proposalsCache.length == i) proposalsCache.append(null) - result += new Proposal(d, m, o, m/temperature, context) - i += 1 - } - result - } - - override def processProposals(props: Seq[Proposal[C]]): DiffList = { - if (props.size == 0 && skipEmptyProposals) return newDiffList - proposalsHook(props) - val proposal = props.size match { - case 0 => throw new Error("No proposals created.") - case 1 => props.head - case _ => { - val p = pickProposal(props) - p.diff match { - case v:Verbosity => v.accepted = true - case _ => println("Difflist not Verbosity: %s (In Proposal)".format(p.diff)) - } - //p.diff.asInstanceOf[Verbosity].accepted = true - p - } - } - props.filter(_.diff.size != 0).foreach { prop => - prop.diff match { - case v:Verbosity => - if(!v.accepted) { - v.newParentId = "" - } - _verbosities += v - - case _ => Unit - } - } - numSamples += 1 - proposal.diff.redo() - proposalHook(proposal) - proposal.diff - } -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/hcoref/package.scala b/src/main/scala/cc/factorie/app/nlp/hcoref/package.scala deleted file mode 100644 index fa75a04..0000000 --- a/src/main/scala/cc/factorie/app/nlp/hcoref/package.scala +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp - -import cc.factorie.util.BasicEvaluatableClustering - -/** - * @author John Sullivan - */ -package object hcoref { - implicit class NodeListUtils[Vars <: NodeVariables[Vars]](val nodes:Iterable[Node[Vars]]) { - private val mentionToRoot = nodes.filter(_.isMention).map(m => m.uniqueId -> m.root.uniqueId) - def predictedClustering = new BasicEvaluatableClustering(mentionToRoot) - - def toSingletons() { - nodes.foreach { node => - node.alterParent(None)(null) - } - } - } - - implicit class MentionListUtils[Vars <: NodeVariables[Vars]](val ments:Iterable[Node[Vars]]) extends AnyVal { - def roots = ments.map(_.root).toSet.toSeq - def nonMentionRoots = ments.map(_.root).filterNot(_.isMention).toSet.toSeq - } - - implicit class NodeListGroundTruthUtils[Vars <: NodeVariables[Vars] with GroundTruth](val nodes:Iterable[Node[Vars]]) { - //this logic is ugly, but should always be correct for mentions - private lazy val mentionToTruth = nodes.filter(_.isMention) - .map(m => m.uniqueId -> m.variables.truth.iterator.next()._1) - def trueClustering = new BasicEvaluatableClustering(mentionToTruth) - - - def labeled = nodes.filter(_.variables.truth.size > 0) - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/CollapseDigitsLemmatizer.scala b/src/main/scala/cc/factorie/app/nlp/lemma/CollapseDigitsLemmatizer.scala index 183ff5e..449a537 100644 --- a/src/main/scala/cc/factorie/app/nlp/lemma/CollapseDigitsLemmatizer.scala +++ b/src/main/scala/cc/factorie/app/nlp/lemma/CollapseDigitsLemmatizer.scala @@ -11,7 +11,8 @@ See the License for the specific language governing permissions and limitations under the License. */ package cc.factorie.app.nlp.lemma -import cc.factorie.app.nlp._ + +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} class CollapseDigitsLemmatizer extends DocumentAnnotator with Lemmatizer { def lemmatize(word:String): String = cc.factorie.app.strings.collapseDigits(word) @@ -25,4 +26,4 @@ class CollapseDigitsLemmatizer extends DocumentAnnotator with Lemmatizer { } object CollapseDigitsLemmatizer extends CollapseDigitsLemmatizer -class CollapseDigitsTokenLemma(token:Token, s:String) extends TokenLemma(token, s) + diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/CollapseDigitsTokenLemma.scala b/src/main/scala/cc/factorie/app/nlp/lemma/CollapseDigitsTokenLemma.scala new file mode 100644 index 0000000..06e411a --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lemma/CollapseDigitsTokenLemma.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.lemma + +import cc.factorie.app.nlp.Token + +class CollapseDigitsTokenLemma(token:Token, s:String) extends TokenLemma(token, s) diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/Lemmatizer.scala b/src/main/scala/cc/factorie/app/nlp/lemma/Lemmatizer.scala index 7a85904..d79f50a 100644 --- a/src/main/scala/cc/factorie/app/nlp/lemma/Lemmatizer.scala +++ b/src/main/scala/cc/factorie/app/nlp/lemma/Lemmatizer.scala @@ -1,21 +1,5 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
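The hcoref package object deleted above builds evaluatable clusterings from mention-to-root and mention-to-truth pairs and hands them to FACTORIE's BasicEvaluatableClustering. As an independent illustration of what such an evaluation measures, here is a small pairwise precision/recall/F1 over two flat clusterings; it is not the metric implementation used by FACTORIE, just the common pairwise view.

object PairwiseClusteringEval {
  // All unordered pairs of ids that share a cluster label.
  private def coClusteredPairs(assignment: Map[String, String]): Set[(String, String)] =
    assignment.groupBy(_._2).values.flatMap { members =>
      val ids = members.keys.toSeq.sorted
      for (i <- ids.indices; j <- i + 1 until ids.size) yield (ids(i), ids(j))
    }.toSet

  // Pairwise precision, recall and F1 of a predicted clustering against a gold clustering.
  def prf(predicted: Map[String, String], gold: Map[String, String]): (Double, Double, Double) = {
    val p = coClusteredPairs(predicted)
    val g = coClusteredPairs(gold)
    val overlap = (p intersect g).size.toDouble
    val precision = if (p.isEmpty) 1.0 else overlap / p.size
    val recall = if (g.isEmpty) 1.0 else overlap / g.size
    val f1 = if (precision + recall == 0) 0.0 else 2 * precision * recall / (precision + recall)
    (precision, recall, f1)
  }
}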
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ package cc.factorie.app.nlp.lemma trait Lemmatizer { def lemmatize(word:String): String } - -object NoopLemmatizer extends Lemmatizer { - def lemmatize(word:String): String = word -} diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/LowercaseLemmatizer.scala b/src/main/scala/cc/factorie/app/nlp/lemma/LowercaseLemmatizer.scala index 17b24ec..f383aa1 100644 --- a/src/main/scala/cc/factorie/app/nlp/lemma/LowercaseLemmatizer.scala +++ b/src/main/scala/cc/factorie/app/nlp/lemma/LowercaseLemmatizer.scala @@ -11,7 +11,9 @@ See the License for the specific language governing permissions and limitations under the License. */ package cc.factorie.app.nlp.lemma -import cc.factorie.app.nlp._ + +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} + class LowercaseLemmatizer extends DocumentAnnotator with Lemmatizer with Serializable { def lemmatize(word:String): String = word.toLowerCase @@ -25,4 +27,4 @@ class LowercaseLemmatizer extends DocumentAnnotator with Lemmatizer with Seriali } object LowercaseLemmatizer extends LowercaseLemmatizer -class LowercaseTokenLemma(token:Token, s:String) extends TokenLemma(token, s) + diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/LowercaseTokenLemma.scala b/src/main/scala/cc/factorie/app/nlp/lemma/LowercaseTokenLemma.scala new file mode 100644 index 0000000..7abec76 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lemma/LowercaseTokenLemma.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.lemma + +import cc.factorie.app.nlp.Token + +class LowercaseTokenLemma(token:Token, s:String) extends TokenLemma(token, s) + diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/PorterLemmatizer.scala b/src/main/scala/cc/factorie/app/nlp/lemma/PorterLemmatizer.scala index f09189c..ef0d96d 100644 --- a/src/main/scala/cc/factorie/app/nlp/lemma/PorterLemmatizer.scala +++ b/src/main/scala/cc/factorie/app/nlp/lemma/PorterLemmatizer.scala @@ -10,12 +10,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
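This hunk also drops NoopLemmatizer from Lemmatizer.scala while keeping the one-method Lemmatizer trait. If anything downstream still expects the removed object, its behaviour is a one-liner against the kept trait; the following is only a sketch of the removed definition, not code added by the patch.

package cc.factorie.app.nlp.lemma

// Identity lemmatizer: returns the word unchanged.
object NoopLemmatizer extends Lemmatizer {
  def lemmatize(word: String): String = word
}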
*/ -/* + package cc.factorie.app.nlp.lemma -import cc.factorie.app.nlp._ + +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} +import cc.factorie.app.strings.PorterStemmer + class PorterLemmatizer extends DocumentAnnotator with Lemmatizer { - def lemmatize(word:String): String = cc.factorie.app.strings.PorterStemmer(word) + def lemmatize(word:String): String = PorterStemmer(word) def process(document:Document): Document = { for (token <- document.tokens) token.attr += new PorterTokenLemma(token, lemmatize(token.string)) document @@ -26,5 +29,4 @@ class PorterLemmatizer extends DocumentAnnotator with Lemmatizer { } object PorterLemmatizer extends PorterLemmatizer -class PorterTokenLemma(token:Token, s:String) extends TokenLemma(token, s) -*/ \ No newline at end of file + diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/PorterTokenLemma.scala b/src/main/scala/cc/factorie/app/nlp/lemma/PorterTokenLemma.scala new file mode 100644 index 0000000..a8e9d6d --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lemma/PorterTokenLemma.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.lemma + +import cc.factorie.app.nlp.Token + +class PorterTokenLemma(token:Token, s:String) extends TokenLemma(token, s) + diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/SimplifyDigitsLemmatizer.scala b/src/main/scala/cc/factorie/app/nlp/lemma/SimplifyDigitsLemmatizer.scala index 5a43ad8..995ec8b 100644 --- a/src/main/scala/cc/factorie/app/nlp/lemma/SimplifyDigitsLemmatizer.scala +++ b/src/main/scala/cc/factorie/app/nlp/lemma/SimplifyDigitsLemmatizer.scala @@ -11,7 +11,9 @@ See the License for the specific language governing permissions and limitations under the License. */ package cc.factorie.app.nlp.lemma -import cc.factorie.app.nlp._ + +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} + class SimplifyDigitsLemmatizer extends DocumentAnnotator { def lemmatize(word:String): String = cc.factorie.app.strings.simplifyDigits(word) @@ -25,4 +27,4 @@ class SimplifyDigitsLemmatizer extends DocumentAnnotator { } object SimplifyDigitsLemmatizer extends SimplifyDigitsLemmatizer -class SimplifyDigitsTokenLemma(token:Token, s:String) extends TokenLemma(token, s) + diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/SimplifyDigitsTokenLemma.scala b/src/main/scala/cc/factorie/app/nlp/lemma/SimplifyDigitsTokenLemma.scala new file mode 100644 index 0000000..6db55f8 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lemma/SimplifyDigitsTokenLemma.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.lemma + +import cc.factorie.app.nlp.Token + +/** + * Created by andrew@andrewresearch.net on 28/10/17. + */ + +class SimplifyDigitsTokenLemma(token:Token, s:String) extends TokenLemma(token, s) diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/TokenLemma.scala b/src/main/scala/cc/factorie/app/nlp/lemma/TokenLemma.scala index 33552f6..f59dadb 100644 --- a/src/main/scala/cc/factorie/app/nlp/lemma/TokenLemma.scala +++ b/src/main/scala/cc/factorie/app/nlp/lemma/TokenLemma.scala @@ -11,7 +11,8 @@ See the License for the specific language governing permissions and limitations under the License. */ package cc.factorie.app.nlp.lemma -import cc.factorie.app.nlp._ + +import cc.factorie.app.nlp.Token import cc.factorie.variable.StringVariable /** Used as an attribute of Token to hold the lemma of the Token.string. 
diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/WordNetLemmatizer.scala b/src/main/scala/cc/factorie/app/nlp/lemma/WordNetLemmatizer.scala index 77ce0d9..5a2426e 100644 --- a/src/main/scala/cc/factorie/app/nlp/lemma/WordNetLemmatizer.scala +++ b/src/main/scala/cc/factorie/app/nlp/lemma/WordNetLemmatizer.scala @@ -14,9 +14,9 @@ package cc.factorie.app.nlp.lemma import java.io.{FileInputStream, InputStream} -import cc.factorie.app.nlp._ import cc.factorie.app.nlp.pos.{PennPosDomain, PennPosTag} import cc.factorie.app.nlp.wordnet.WordNet +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} import cc.factorie.util.ClasspathURL import scala.io.Source @@ -168,5 +168,5 @@ object WordNetLemmatizer extends WordNetLemmatizer(string => ClasspathURL.fromDi // } //} -class WordNetTokenLemma(token:Token, s:String) extends TokenLemma(token, s) + diff --git a/src/main/scala/cc/factorie/app/nlp/lemma/WordNetTokenLemma.scala b/src/main/scala/cc/factorie/app/nlp/lemma/WordNetTokenLemma.scala new file mode 100644 index 0000000..a9535c0 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lemma/WordNetTokenLemma.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.lemma + +import cc.factorie.app.nlp.Token + +/** + * Created by andrew@andrewresearch.net on 28/10/17. + */ + +class WordNetTokenLemma(token:Token, s:String) extends TokenLemma(token, s) diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/AhoCorasick.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/AhoCorasick.scala index 1dfadd1..1a23bb1 100644 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/AhoCorasick.scala +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/AhoCorasick.scala @@ -205,32 +205,4 @@ object AhoCorasick { private val logger = Logger.getLogger("cc.factorie.app.nlp.lexicon.AhoCorasick") } -/** - * An Aho-Corasick mention, containing the mention string, and the start & end - * character indices in the original text. 
- */ -class LexiconMention(val mention : String, val startIdx : Int, val endIdx : Int) extends Serializable { - override def toString() : String = { "Mention: " + mention + ", startIdx = " + startIdx + ", endIdx = " + endIdx } - - override def hashCode() : Int = { mention.hashCode() ^ startIdx ^ endIdx } - - override def equals(obj : Any) : Boolean = { - if (obj == null) { - return false - } - if (getClass() != obj.getClass()) { - return false - } - val other = obj.asInstanceOf[LexiconMention] - if (!this.mention.equals(other.mention)) { - return false - } - if (this.startIdx != other.startIdx) { - return false - } - if (this.endIdx != other.endIdx) { - return false - } - return true - } -} + diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/CustomStopWords.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/CustomStopWords.scala new file mode 100644 index 0000000..dceb169 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/CustomStopWords.scala @@ -0,0 +1,19 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.app.nlp.lemma.LowercaseLemmatizer +import cc.factorie.app.strings.nonWhitespaceClassesSegmenter + +class CustomStopWords extends TriePhraseLexicon("CustomStopWords", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { + def this(filename: String) = { + this() + this ++= scala.io.Source.fromFile(filename) + } + def this(words: Seq[String]) = { + this() + words.foreach { w => this += w } + } +} + +object CustomStopWords { + def apply(filename: String) = new CustomStopWords(filename) +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/Determiner.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/Determiner.scala new file mode 100644 index 0000000..81d024e --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/Determiner.scala @@ -0,0 +1,26 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.app.nlp.lemma.LowercaseLemmatizer +import cc.factorie.app.strings.nonWhitespaceClassesSegmenter + +object Determiner extends PhraseLexicon("Determiner", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { + this ++= + """the +a +this +an +that +some +all +these +no +any +those +another +both +each +every +either +neither +""" +} diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/GenericLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/GenericLexicon.scala new file mode 100644 index 0000000..ad58c37 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/GenericLexicon.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.util.ModelProvider + +/** + * Created by andrew@andrewresearch.net on 28/10/17. + */ + +class GenericLexicon(name:String, val provider:ModelProvider[GenericLexicon]) extends TriePhraseLexicon(name) with ProvidedLexicon[GenericLexicon] \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/Lexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/Lexicon.scala index 8ed3170..0542ddd 100644 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/Lexicon.scala +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/Lexicon.scala @@ -1,31 +1,18 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
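CustomStopWords, added above, is a TriePhraseLexicon that can be populated either from a file or from an in-memory word list. A short usage sketch follows; note that only the ++=(Source) path calls setTransitions automatically, so it is invoked by hand here, and whether lookups strictly require it depends on AhoCorasick internals not shown in this patch.

import cc.factorie.app.nlp.lexicon.CustomStopWords

val stops = new CustomStopWords(Seq("the", "a", "an", "of"))
stops.setTransitions()            // mirror what the Source-based population path does
println(stops.contains("The"))    // expected true: entries and queries are lowercased
println(stops.contains("word"))   // expected false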
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp.lexicon -import cc.factorie.app.nlp.{Token, TokenSpan} +import java.io.File + +import cc.factorie.app.chain.Observation +import cc.factorie.app.nlp.TokenSpan import cc.factorie.app.nlp.lemma.{Lemmatizer, LowercaseLemmatizer} import cc.factorie.app.strings.StringSegmenter -import cc.factorie.variable.CategoricalVectorVar -import scala.collection.mutable.{ArrayBuffer, HashMap} -import scala.io.{Codec, Source} -import java.io.{File, InputStream} +import scala.io.Source -import cc.factorie.app.chain.Observation /** The general interface to a lexicon. Both WordLexicon and PhraseLexicon are subclasses. - @author Andrew McCallum */ + * + * @author Andrew McCallum */ trait Lexicon { /** An identifier for this lexicon, suitable for adding as a category to a FeatureVectorVariable[String]. */ def name: String @@ -46,29 +33,16 @@ trait Lexicon { def containsWords(words: Seq[String]): Boolean = containsLemmatizedWords(words.map(lemmatizer.lemmatize(_))) /** Is this Token (or more generally Observation) a member of a phrase in the lexicon (including single-word phrases)? The query.string will be processed by the lemmatizer. - For example if query.string is "New" and query.next.string is "York" and the two-word phrase "New York" is in the lexicon, + For example if query.string is "New" and query.next.string is "York" and the two-word phrase "New York" is in the lexicon, then this method will return true. But if query.next.string is "shoes" (and "New shoes" is not in the lexicon) this method will return false. */ def contains[T<:Observation[T]](query:T): Boolean def contains[T<:Observation[T]](query:Seq[T]): Boolean def contains(span:TokenSpan): Boolean = contains(span.value) - /** Is the input String in the lexicon. The input is tokenized and lemmatized; + /** Is the input String in the lexicon. The input is tokenized and lemmatized; if the tokenizer indicates that it is a multi-word phrase, it will be processed by containsWords, otherwise containsWord. */ def contains(untokenizedString:String): Boolean = { val words = tokenizer(untokenizedString).map(lemmatizer.lemmatize(_)).toSeq; if (words.length == 1) containsWord(words.head) else containsWords(words) } } -trait MutableLexicon extends Lexicon { - // For populating the lexicon - /** Tokenize and lemmatize the input String and add it as a single entry to the Lexicon */ - def +=(phrase:String): Unit - /** All a lines from the input Source to this lexicon. Source is assumed to contain multiple newline-separated lexicon entries */ - def ++=(source:Source): this.type = { for (line <- source.getLines()) { val phrase = line.trim; if (phrase.length > 0 && !phrase.startsWith("#")) MutableLexicon.this.+=(phrase) }; source.close(); this } - /** All a lines from the input String to this lexicon. String contains multiple newline-separated lexicon entries */ - def ++=(phrases:String): this.type = ++=(Source.fromString(phrases)) - /** All a lines from the input File to this lexicon. 
File contains multiple newline-separated lexicon entries */ - def ++=(file:File, enc:String = "UTF-8"): this.type = ++=(Source.fromFile(file, enc)) - /** Add all lines from the InputStream to this lexicon */ - def ++=(is:InputStream): this.type = this.++=(Source.fromInputStream(is)(Codec.UTF8)) -} /** Support for constructing Lexicons @author Andrew McCallum */ @@ -84,356 +58,3 @@ object Lexicon { def fromResource(resourceFilename:String, tokenizer:StringSegmenter = cc.factorie.app.strings.nonWhitespaceSegmenter, lemmatizer:Lemmatizer = LowercaseLemmatizer): Lexicon = fromSource(resourceFilename, Source.fromInputStream(getClass.getResourceAsStream(resourceFilename))) } - -/** A lexicon containing single words or multi-word phrases. - * @author Kate Silverstein - */ -@deprecated("Use TriePhraseLexicon instead", "Before 10/1/15") -class PhraseLexicon(val name: String, val tokenizer: StringSegmenter = cc.factorie.app.strings.nonWhitespaceSegmenter, val lemmatizer: Lemmatizer = LowercaseLemmatizer) extends MutableLexicon { - def this(file: File) = { this(file.toString, cc.factorie.app.strings.nonWhitespaceSegmenter, LowercaseLemmatizer); this.++=(Source.fromFile(file)(scala.io.Codec.UTF8))} - val wordTree = new SuffixTree(false) - def +=(phrase:String): Unit = { - val words: Seq[String] = tokenizer(phrase).toSeq - wordTree.add(words.map(lemmatizer.lemmatize(_))) - } - /** Checks whether the lexicon contains this already-lemmatized/tokenized single word */ - def containsLemmatizedWord(word: String): Boolean = { - containsLemmatizedWords(List(word).toSeq) - } - /** Checks whether the lexicon contains this already-lemmatized/tokenized phrase, where 'words' can either be - * single word or a multi-word expression. */ - def containsLemmatizedWords(words: Seq[String]): Boolean = { - wordTree.contains(words) - } - /** Tokenizes and lemmatizes the string of each entry in 'query', then checks if the sequence is in the lexicon*/ - def contains[T<:Observation[T]](query: Seq[T]): Boolean = { - val strings = query.map(_.string) - val tokenized = strings.flatMap(tokenizer(_)) - val lemmatized = tokenized.map(lemmatizer.lemmatize(_)).toSeq - containsLemmatizedWords(lemmatized) - } - /** Tokenizes and lemmatizes query.string, then checks if the sequence is in the lexicon */ - def contains[T<:Observation[T]](query: T): Boolean = { - val tokenized = tokenizer(query.string).toSeq - val lemmatized = tokenized.map(lemmatizer.lemmatize(_)) - containsLemmatizedWords(lemmatized) - } - override def toString(): String = { "" } - - /** Return length of match, or -1 if no match. 
*/ - def startsAt[T<:Observation[T]](query:T): Int = { - if (contains(query)){ - val tokenized = tokenizer(query.string).toSeq - val lemmatized = tokenized.map(lemmatizer.lemmatize(_)) - return wordTree.getSuffixIndex(lemmatized, true) - } - -1 - } -} - -/** a union of many PhraseLexicons - * @author Kate Silverstein */ -class UnionLexicon(val name: String, val members: PhraseLexicon*) extends MutableLexicon { - def tokenizer: StringSegmenter = members.head.tokenizer - def lemmatizer: Lemmatizer = members.head.lemmatizer - def containsLemmatizedWord(word: String): Boolean = members.exists(_.containsLemmatizedWord(word)) - def containsLemmatizedWords(word: Seq[String]): Boolean = members.exists(_.containsLemmatizedWords(word)) - def contains[T<:Observation[T]](query: T): Boolean = members.exists(_.contains(query)) - def contains[T<:Observation[T]](query: Seq[T]): Boolean = members.exists(_.contains(query)) - def +=(s:String): Unit = {throw new Error("method not implemented for UnionLexicon")} - override def toString: String = { - var st = "UNION { " - members.foreach(st += _.toString()+" , ") - st += " } " - st - } -} - -/** - * A phrase lexicon based on Aho-Corasick Trie lookups. - * Use the tag text methods in preference to the other methods, which are preserved for compatibility. - * The other methods have the same semantics as the PhraseLexicon, which return true iff the whole string is in the lexicon. - */ -class TriePhraseLexicon(val name: String, val tokenizer: StringSegmenter = cc.factorie.app.strings.nonWhitespaceSegmenter, val lemmatizer: Lemmatizer = LowercaseLemmatizer, val sep: String = " ") extends MutableLexicon { - val trie = new AhoCorasick(sep) - - def +=(phrase:String): Unit = synchronized { - val words: Seq[String] = tokenizer(phrase).toSeq - trie += words.map(lemmatizer.lemmatize) - } - - /** All a lines from the input Source to this lexicon. Source is assumed to contain multiple newline-separated lexicon entries. - * Overriden to call setTransitions after reading the file. - */ - override def ++=(source:Source): this.type = synchronized { for (line <- source.getLines()) { val phrase = line.trim; if (phrase.length > 0) TriePhraseLexicon.this.+=(phrase) }; trie.setTransitions(); source.close(); this } - - def setTransitions() : Unit = synchronized { trie.setTransitions() } - - /** Checks whether the lexicon contains this already-lemmatized/tokenized single word */ - def containsLemmatizedWord(word: String): Boolean = { containsLemmatizedWords(List(word).toSeq) } - - /** Checks whether the lexicon contains this already-lemmatized/tokenized phrase, where 'words' can either be - * single word or a multi-word expression. 
*/ - def containsLemmatizedWords(words: Seq[String]): Boolean = { - trie.findExactMention(words) - } - - /** Tokenizes and lemmatizes the string of each entry in 'query', then checks if the exact sequence is in the lexicon*/ - def contains[T<:Observation[T]](query: Seq[T]): Boolean = { - val strings = query.map(_.string) - val tokenized = strings.flatMap(tokenizer(_)) - val lemmatized = tokenized.map(lemmatizer.lemmatize(_)).toSeq - containsLemmatizedWords(lemmatized) - } - - /** Tokenizes and lemmatizes query.string, then checks if the exact sequence is in the lexicon */ - def contains[T<:Observation[T]](query: T): Boolean = { - val tokenized = tokenizer(query.string).toSeq - val lemmatized = tokenized.map(lemmatizer.lemmatize(_)) - containsLemmatizedWords(lemmatized) - } - - override def toString(): String = { "" } - - /** Tags each token with the specified tag, if it is present in the lexicon */ - def tagLemmatizedText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String) : Unit = { - trie.tagMentions(tokens,featureFunc,tag) - } - - /** Tags each token with the specified tag, if the lemmatized form is present in the lexicon */ - def tagText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String) : Unit = { - trie.lemmatizeAndTagMentions(tokens,featureFunc,tag,lemmatizer) - } - - /** Tags each token with the specified tag, if the lemmatized form is present in the lexicon */ - def tagText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String, lemmaFunc : (Token => String)) : Unit = { - trie.tagMentions(tokens,featureFunc,tag,lemmaFunc) - } -} - -/** - * A union lexicon of multiple TriePhraseLexicons. - * Has similar semantics to the TriePhraseLexicon. - */ -class TrieUnionLexicon[L <: TriePhraseLexicon](val name: String, val members: L*) extends MutableLexicon { - def tokenizer: StringSegmenter = members.head.tokenizer - def lemmatizer: Lemmatizer = members.head.lemmatizer - def containsLemmatizedWord(word: String): Boolean = members.exists(_.containsLemmatizedWord(word)) - def containsLemmatizedWords(word: Seq[String]): Boolean = members.exists(_.containsLemmatizedWords(word)) - def contains[T<:Observation[T]](query: T): Boolean = members.exists(_.contains(query)) - def contains[T<:Observation[T]](query: Seq[T]): Boolean = members.exists(_.contains(query)) - def +=(s:String): Unit = {throw new Error("TrieUnionLexicon is immutable. Append to the appropriate TriePhraseLexicon.")} - override def toString(): String = { - var st = "UNION { " - members.foreach(st += _.toString()+" , ") - st += " } " - st - } - - def tagLemmatizedText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String) : Unit = { - members.map(_.tagLemmatizedText(tokens,featureFunc,tag)) - } - - def tagText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String) : Unit = { - members.map(_.tagText(tokens,featureFunc,tag)) - } - - /** Tags each token with the specified tag, if the lemmatized form is present in the lexicon */ - def tagText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String, lemmaFunc : (Token => String)) : Unit = { - members.map(_.tagText(tokens,featureFunc,tag,lemmaFunc)) - } -} - -/** A union of multiple lexicons. Answer "contains" queries with true, if any of the member Lexicons contain the query. 
- @author Andrew McCallum */ -@deprecated("Use TriePhraseLexicon instead", "Before 10/1/15") -class ChainUnionLexicon(val name: String, val members:Lexicon*) extends Lexicon { - def tokenizer: StringSegmenter = members.head.tokenizer - def lemmatizer: Lemmatizer = members.head.lemmatizer - def containsLemmatizedWord(word:String): Boolean = members.exists(_.containsLemmatizedWord(word)) - def containsLemmatizedWords(words: Seq[String]): Boolean = members.exists(_.containsLemmatizedWords(words)) - def contains[T<:Observation[T]](query:T): Boolean = members.exists(_.contains(query)) - def contains[T<:Observation[T]](query:Seq[T]): Boolean = members.exists(_.contains(query)) -} - -/** A Lexicon that can only hold single-word lexicon entries, but which is efficient for this case. - with methods to check whether a String or Token (or more generally a cc.factorie.app.chain.Observation) is in the list. - @author Andrew McCallum */ -@deprecated("Use TriePhraseLexicon instead", "Before 10/1/15") -class ChainWordLexicon(val name:String, val tokenizer:StringSegmenter = cc.factorie.app.strings.nonWhitespaceSegmenter, val lemmatizer:Lemmatizer = LowercaseLemmatizer) extends MutableLexicon { - val contents = new scala.collection.mutable.HashSet[String] - def +=(phrase:String): Unit = { - val words: Seq[String] = tokenizer(phrase).toSeq - if (words.length == 1) contents += lemmatizer.lemmatize(words.head) else throw new MultiWordException("Cannot add multi-word phrase to WordLexicon: "+phrase+" => "+words+" by segmenter "+tokenizer.getClass) - } - final def containsLemmatizedWord(word:String): Boolean = contents.contains(word) - def contains[T<:Observation[T]](query:T): Boolean = containsWord(query.string) - def containsLemmatizedWords(words: Seq[String]): Boolean = if (words.length == 1) containsLemmatizedWord(words.head) else false - def contains[T<:Observation[T]](query:Seq[T]): Boolean = if (query.length == 1) containsWord(query.head.string) else false -} - -/** An exception thrown when someone tries to add a multi-word phrase to a WordLexicon. */ -class MultiWordException(msg:String) extends Exception(msg) - -/** A list of words or phrases, with methods to check whether a String, Seq[String], or Token (or more generally a cc.factorie.app.chain.Observation) is in the list. - @author Andrew McCallum */ -@deprecated("Use TriePhraseLexicon instead", "Before 10/1/15") -class ChainPhraseLexicon(val name:String, val tokenizer:StringSegmenter = cc.factorie.app.strings.nonWhitespaceSegmenter, val lemmatizer:Lemmatizer = LowercaseLemmatizer) extends MutableLexicon { - // The next two constructors are there just to support legacy usage, and should ultimately be removed. - /** Populate lexicon from file, with one entry per line, consisting of space-separated tokens. 
*/ - def this(file:File) = { this(file.toString, cc.factorie.app.strings.nonWhitespaceSegmenter, LowercaseLemmatizer); this.++=(Source.fromFile(file)(scala.io.Codec.UTF8)) } - //def this(caseSensitive:Boolean) = this(lemmatizer = if (caseSensitive) LowercaseLemmatizer else NoopLemmatizer) - - class LexiconToken extends Observation[LexiconToken] { - def string: String = throw new Error("string unknown; in key only.") - def next: LexiconToken = null - def next_=(lt:LexiconToken): Unit = throw new Error - def prev_=(lt:LexiconToken): Unit = throw new Error - def prev: LexiconToken = null - def hasNext = false - def hasPrev = false - def position = 0 - def lengthToEnd = 1 - } - object LexiconToken extends LexiconToken // Used to efficiently represent single-word lexicon entries - class LexiconPhraseToken(override val string:String) extends LexiconToken { - override var next: LexiconToken = null - override var prev: LexiconToken = null - override def hasNext = next != null - override def hasPrev = prev != null - override def position = lengthToEnd - override def lengthToEnd: Int = if (next == null) 1 else 1 + next.lengthToEnd - } - private def newLexiconTokens(words:Seq[String]): Seq[LexiconPhraseToken] = { - val result = new ArrayBuffer[LexiconPhraseToken] - var t: LexiconPhraseToken = null - for (word <- words) { - val t2 = new LexiconPhraseToken(word) - t2.prev = t - if (t != null) t.next = t2 - t = t2 - result += t2 - } - result - } - val contents = new HashMap[String,List[LexiconToken]] - private def +=(t:LexiconPhraseToken): Unit = { - val key = lemmatizer.lemmatize(t.string) - val old: List[LexiconToken] = contents.getOrElse(key, Nil) - contents(key) = t :: old - } - /** Add a new lexicon entry consisting of one or more words. The Lexicon's tokenizer will be used to split the string, if possible. */ - def +=(phrase:String): Unit = { - val words: Seq[String] = tokenizer(phrase).toSeq - if (words.length == 1) { - val word = words.head - val key = lemmatizer.lemmatize(word) - val old: List[LexiconToken] = contents.getOrElse(key, Nil) - contents(key) = LexiconToken :: old - } else { - ChainPhraseLexicon.this += newLexiconTokens(words.map(lemmatizer.lemmatize(_))) - } - } - private def +=(ts:Seq[LexiconPhraseToken]): Unit = { - //println("Lexicon adding "+ts.map(_.word)) - ts.foreach(t => ChainPhraseLexicon.this += t) - } - /** Add a new lexicon entry consisting of a multi-string phrase. */ - //def +=(ws:Seq[String]): Unit = this.+=(newLexiconTokens(ws.map(lemmatizer.lemmatize(_)))) - //def ++=(source:Source): Unit = for (line <- source.getLines()) yield { PhraseLexicon.this.+=(line); /*println("TokenSeqs.Lexicon adding "+line)*/ } - /** String contains multiple newline-separated lexicon entries */ - //def ++=(phrases:String): Unit = ++=(Source.fromString(phrases)) - //def ++=(file:File, enc:String = "UTF-8"): Unit = ++=(Source.fromFile(file, enc)) - def phrases: Seq[String] = { - def phrase(entry:LexiconToken): String = if (entry.hasNext) entry.string + " " + phrase(entry.next) else entry.string - val result = new ArrayBuffer[String] - for (key <- contents.keys; entry <- contents(key)) { - if (entry eq LexiconToken) result += key - else result += phrase(entry.chainHead) - } - result.distinct - } - /** Do any of the Lexicon entries contain the given word string. 
*/ - def containsLemmatizedWord(word:String): Boolean = contents.contains(word) - def containsLemmatizedWords(words: Seq[String]): Boolean = newLexiconTokens(words).nonEmpty && contains(newLexiconTokens(words).head.asInstanceOf[Observation[LexiconToken]]) - def contains[T<:Observation[T]](query:Seq[T]): Boolean = { - val queryToken = query.head - val entries = contents.getOrElse(lemmatizer.lemmatize(queryToken.string), Nil) - for (entry <- entries) { - if (entry eq LexiconToken) return true // The lexicon entry is a single word, indicated just by the presence (keyString, object LexiconToken) - var te: LexiconToken = entry - var tq = queryToken - var result = true - // Check for match all the way to the end of this lexicon entry - do { - if (te.string != lemmatizer.lemmatize(tq.string)) result = false - //if ((!caseSensitive && te.string != tq.string.toLowerCase) || (caseSensitive && te.string != tq.string)) result = false - te = te.next; tq = tq.next - } while (te != null && tq != null && result) - if (result && te == null) { - //print(" contains length="+entry.length+" "+entry.seq.map(_.word).toList) - return true - } - } - false - } - - /** Is 'query' in the lexicon, accounting for lexicon phrases and the context of 'query' */ - def contains[T<:Observation[T]](query:T): Boolean = { - //println("contains "+query.word+" "+query.hasPrev+" "+query) - val entries = contents.getOrElse(lemmatizer.lemmatize(query.string), Nil) - for (entry <- entries) { - if (entry eq LexiconToken) return true // The lexicon entry is a single word, indicated just by the presence (keyString, object LexiconToken) - var te: LexiconToken = entry - var tq = query - var result = true - // Go the beginning of this lexicon entry - while (te.hasPrev && result) { - if (!tq.hasPrev) return false - te = te.prev; tq = tq.prev - } - //println(" Trying "+query.word+" "+entry.seq.map(_.word).toList) - // Check for match all the way to the end of this lexicon entry - do { - if (te.string != lemmatizer.lemmatize(tq.string)) result = false - //if ((!caseSensitive && te.string != tq.string.toLowerCase) || (caseSensitive && te.string != tq.string)) result = false - te = te.next; tq = tq.next - } while (te != null && tq != null && result) - if (result && te == null) { - //print(" contains length="+entry.length+" "+entry.seq.map(_.word).toList) - return true - } - } - false - } - /** Is 'query' in the lexicon, ignoring context. */ - def containsSingle[T<:Observation[T]](query:T): Boolean = contents.contains(lemmatizer.lemmatize(query.string)) - - // TODO this method seems to be broken -KS - /** Return length of match, or -1 if no match. 
*/ - def startsAt[T<:Observation[T]](query:T): Int = { - val key = lemmatizer.lemmatize(query.string) - val entries = contents.getOrElse(key, Nil) - for (entry <- entries.filter(_.hasPrev == false).sortBy(entry => -entry.lengthToEnd)) { // Sort so that we look for long entries first - var te = entry - var tq = query - var len = 0 - var found = true - // Query must be at the the beginning of this lexicon entry - // Check for match all the way to the end of this lexicon entry - do { - // accessing te.string throws an Error - if (te.string != lemmatizer.lemmatize(tq.string)) found = false - //if ((!caseSensitive && te.string != tq.string.toLowerCase) || (caseSensitive && te.string != tq.string)) found = false - len += 1 - te = te.next; tq = tq.next - } while (te != null && tq != null && found) - if (found && te == null) { - //print(" contains length="+entry.length+" "+entry.seq.map(_.word).toList) - return len - } - } - -1 - } -} - diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/LexiconMention.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/LexiconMention.scala new file mode 100644 index 0000000..4d93429 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/LexiconMention.scala @@ -0,0 +1,33 @@ +package cc.factorie.app.nlp.lexicon + +import java.io.Serializable + +/** + * An Aho-Corasick mention, containing the mention string, and the start & end + * character indices in the original text. + */ +class LexiconMention(val mention : String, val startIdx : Int, val endIdx : Int) extends Serializable { + override def toString() : String = { "Mention: " + mention + ", startIdx = " + startIdx + ", endIdx = " + endIdx } + + override def hashCode() : Int = { mention.hashCode() ^ startIdx ^ endIdx } + + override def equals(obj : Any) : Boolean = { + if (obj == null) { + return false + } + if (getClass() != obj.getClass()) { + return false + } + val other = obj.asInstanceOf[LexiconMention] + if (!this.mention.equals(other.mention)) { + return false + } + if (this.startIdx != other.startIdx) { + return false + } + if (this.endIdx != other.endIdx) { + return false + } + return true + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/Lexicons.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/Lexicons.scala deleted file mode 100644 index 2d308cb..0000000 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/Lexicons.scala +++ /dev/null @@ -1,264 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.lexicon -import cc.factorie.app.nlp.lemma._ -import cc.factorie.app.strings._ - -object NumberWords extends PhraseLexicon("NumberWords", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { - this ++= -"""zero -one -two -three -four -five -six -seven -eight -nine -ten -tens -dozen -dozens -eleven -twelve -thirteen -fourteen -fifteen -sixteen -seventeen -eighteen -nineteen -twenty -thirty -forty -fifty -sixty -seventy -eighty -ninety -hundred -hundreds -thousand -thousands -million -millions -billion -billions -trillion -trillions -quadrillion -quintillion -sextillion -septillion -zillion -umpteen -multimillion -multibillion -""" -} - - -object Determiner extends PhraseLexicon("Determiner", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { - this ++= -"""the -a -this -an -that -some -all -these -no -any -those -another -both -each -every -either -neither -""" -} - -object Pronoun extends PhraseLexicon("Pronoun", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { - this ++= -"""all -another -any -anybody -anyone -anything -both -each -each other -either -everybody -everyone -everything -few -he -her -hers -herself -him -himself -his -I -it -its -itself -many -me -mine -myself -neither -no_one -nobody -none -nothing -one -one another -other -ours -ourselves -several -she -some -somebody -someone -something -such -that -theirs -them -themselves -these -they -this -those -us -we -what -whatever -which -whichever -who -whoever -whom -whomever -whose -you -yours -yourself -yourselves""" -} - -object PersonPronoun extends PhraseLexicon("PersonPronoun", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { - this ++= -"""anybody -anyone -everybody -everyone -he -her -hers -herself -him -himself -his -I -me -mine -myself -nobody -ours -ourselves -she -somebody -someone -theirs -them -themselves -they -us -we -who -whoever -whom -whomever -whose -you -yours -yourself -yourselves""" -} - -object PosessiveDeterminer extends PhraseLexicon("PosessiveDeterminer", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { - this ++= -"""my -your -his -her -its -their""" -} - -/** A non-exhaustive list of common English prepositions. 
*/ -object Preposition extends PhraseLexicon("Preposition", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { - this ++= -"""about -above -across -after -against -around -at -before -behind -below -beneath -beside -besides -between -beyond -by -down -during -except -for -from -in -inside -into -like -near -of -off -on -out -outside -over -since -through -throughout -till -to -toward -under -until -up -upon -with -without""" -} diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/LexiconsProvider.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/LexiconsProvider.scala new file mode 100644 index 0000000..41c5b63 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/LexiconsProvider.scala @@ -0,0 +1,88 @@ +package cc.factorie.app.nlp.lexicon + +import java.io.{File, InputStream} +import java.net.URL +import java.nio.file.Path + +import cc.factorie.util.{ClasspathURL, ModelProvider} + +import scala.reflect.{ClassTag, classTag} +import scala.util.Try + +trait LexiconsProvider { + def lexiconRoot:String + implicit def provide[L : ClassTag]:ModelProvider[L] +} + +object LexiconsProvider { + import cc.factorie.util.ISAble._ + + private def lexiconNamePieces[L:ClassTag]:Seq[String] = { + val arr = classTag[L].runtimeClass.getName.split("""\.""").map(_.stripSuffix("$")) + val fileName = arr.last.zipWithIndex.flatMap { + case (u, 0) => u.toLower.toString + case (u, _) if u.isUpper => "-" + u.toLower + case (l, _) => l.toString + }.mkString("") + ".txt" + arr.init.map(_.toLowerCase) ++ Seq(fileName) + } + + private def fullLexiconName[L:ClassTag] = lexiconNamePieces[L].mkString("/") + private def shortLexiconName[L:ClassTag] = lexiconNamePieces[L].drop(5).mkString("/") + + + def fromFile(f:File, useFullPath:Boolean = false):LexiconsProvider = new LexiconsProvider { + lazy val lexiconRoot = f.getAbsolutePath + override implicit def provide[L : ClassTag]: ModelProvider[L] = new ModelProvider[L] { + private val path = f.toPath.resolve(if(useFullPath) fullLexiconName[L] else shortLexiconName[L]) + val coordinates = path.toString + val provide:InputStream = buffered(path) + } + } + + def fromUrl(u:URL, useFullPath:Boolean = false):LexiconsProvider = new LexiconsProvider { + lazy val lexiconRoot = u.toString + implicit def provide[L:ClassTag]: ModelProvider[L] = new ModelProvider[L] { + private val modelUrl = new URL(u, if(useFullPath) fullLexiconName[L] else shortLexiconName[L]) + val provide: InputStream = buffered(modelUrl) + val coordinates: String = modelUrl.toString + } + } + + implicit def providePath(p:Path):LexiconsProvider = fromFile(p.toFile, false) + implicit def provideFile(f:File):LexiconsProvider = fromFile(f,false) + implicit def provideURL(u:URL):LexiconsProvider = fromUrl(u, false) + + def fromString(s:String, useFullPath:Boolean=false):LexiconsProvider = s match { + case cp if cp.toLowerCase == "classpath" => classpath(useFullPath) + case urlS if Try(new URL(urlS)).isSuccess => fromUrl(new URL(urlS), useFullPath) + case p => fromFile(new File(p), useFullPath) + } + + @deprecated("This exists to preserve legacy functionality", "10/27/15") + def classpath(useFullPath:Boolean=true):LexiconsProvider = new LexiconsProvider { + def lexiconRoot = "classpath" + implicit def provide[L: ClassTag]: ModelProvider[L] = new ModelProvider[L] { + private def url = if(useFullPath) ClasspathURL.fromDirectory[Lexicon](shortLexiconName[L]) else this.getClass.getResource("/" + shortLexiconName[L]) + def coordinates: String = url.toString + def provide: InputStream = url + } + } + + /* + 
@deprecated("This exists to preserve legacy functionality", "10/05/15") + def classpath:LexiconsProvider = new LexiconsProvider { + //lazy val lexiconRoot = ClasspathURL.fromDirectory[Lexicon]("") + lazy val lexiconRoot = Lexicon.getClass.getResource("") + implicit def provide[L : ClassTag]: ModelProvider[L] = new ModelProvider[L] { + private val url = { + println("root " + lexiconRoot) + println("shortname" + shortLexiconName[L]) + new URL(lexiconRoot, shortLexiconName[L]) + } + val coordinates: String = url.toString + val provide: InputStream = buffered(url) + } + } + */ +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/MutableLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/MutableLexicon.scala new file mode 100644 index 0000000..f95f0a5 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/MutableLexicon.scala @@ -0,0 +1,19 @@ +package cc.factorie.app.nlp.lexicon + +import java.io.{File, InputStream} + +import scala.io.{Codec, Source} + +trait MutableLexicon extends Lexicon { + // For populating the lexicon + /** Tokenize and lemmatize the input String and add it as a single entry to the Lexicon */ + def +=(phrase:String): Unit + /** All a lines from the input Source to this lexicon. Source is assumed to contain multiple newline-separated lexicon entries */ + def ++=(source:Source): this.type = { for (line <- source.getLines()) { val phrase = line.trim; if (phrase.length > 0 && !phrase.startsWith("#")) MutableLexicon.this.+=(phrase) }; source.close(); this } + /** All a lines from the input String to this lexicon. String contains multiple newline-separated lexicon entries */ + def ++=(phrases:String): this.type = ++=(Source.fromString(phrases)) + /** All a lines from the input File to this lexicon. File contains multiple newline-separated lexicon entries */ + def ++=(file:File, enc:String = "UTF-8"): this.type = ++=(Source.fromFile(file, enc)) + /** Add all lines from the InputStream to this lexicon */ + def ++=(is:InputStream): this.type = this.++=(Source.fromInputStream(is)(Codec.UTF8)) +} diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/NumberWords.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/NumberWords.scala new file mode 100644 index 0000000..d0ef5b3 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/NumberWords.scala @@ -0,0 +1,62 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.app.nlp.lemma.LowercaseLemmatizer +import cc.factorie.app.strings.nonWhitespaceClassesSegmenter + +/** + * Created by andrew@andrewresearch.net on 28/10/17. 
+ */ + +object NumberWords extends PhraseLexicon("NumberWords", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { + this ++= + """zero +one +two +three +four +five +six +seven +eight +nine +ten +tens +dozen +dozens +eleven +twelve +thirteen +fourteen +fifteen +sixteen +seventeen +eighteen +nineteen +twenty +thirty +forty +fifty +sixty +seventy +eighty +ninety +hundred +hundreds +thousand +thousands +million +millions +billion +billions +trillion +trillions +quadrillion +quintillion +sextillion +septillion +zillion +umpteen +multimillion +multibillion +""" +} diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/PersonPronoun.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/PersonPronoun.scala new file mode 100644 index 0000000..ae476d9 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/PersonPronoun.scala @@ -0,0 +1,44 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.app.nlp.lemma.LowercaseLemmatizer +import cc.factorie.app.strings.nonWhitespaceClassesSegmenter + +object PersonPronoun extends PhraseLexicon("PersonPronoun", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { + this ++= + """anybody +anyone +everybody +everyone +he +her +hers +herself +him +himself +his +I +me +mine +myself +nobody +ours +ourselves +she +somebody +someone +theirs +them +themselves +they +us +we +who +whoever +whom +whomever +whose +you +yours +yourself +yourselves""" +} diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/PhraseLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/PhraseLexicon.scala new file mode 100644 index 0000000..4e82f9b --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/PhraseLexicon.scala @@ -0,0 +1,57 @@ +package cc.factorie.app.nlp.lexicon + +import java.io.File + +import cc.factorie.app.chain.Observation +import cc.factorie.app.nlp.lemma.{Lemmatizer, LowercaseLemmatizer} +import cc.factorie.app.strings.StringSegmenter + +import scala.io.Source + +/** A lexicon containing single words or multi-word phrases. + * + * @author Kate Silverstein + */ +@deprecated("Use TriePhraseLexicon instead", "Before 10/1/15") +class PhraseLexicon(val name: String, val tokenizer: StringSegmenter = cc.factorie.app.strings.nonWhitespaceSegmenter, val lemmatizer: Lemmatizer = LowercaseLemmatizer) extends MutableLexicon { + def this(file: File) = { this(file.toString, cc.factorie.app.strings.nonWhitespaceSegmenter, LowercaseLemmatizer); this.++=(Source.fromFile(file)(scala.io.Codec.UTF8))} + val wordTree = new SuffixTree(false) + def +=(phrase:String): Unit = { + val words: Seq[String] = tokenizer(phrase).toSeq + wordTree.add(words.map(lemmatizer.lemmatize(_))) + } + /** Checks whether the lexicon contains this already-lemmatized/tokenized single word */ + def containsLemmatizedWord(word: String): Boolean = { + containsLemmatizedWords(List(word).toSeq) + } + /** Checks whether the lexicon contains this already-lemmatized/tokenized phrase, where 'words' can either be + * single word or a multi-word expression. 
*/ + def containsLemmatizedWords(words: Seq[String]): Boolean = { + wordTree.contains(words) + } + /** Tokenizes and lemmatizes the string of each entry in 'query', then checks if the sequence is in the lexicon*/ + def contains[T<:Observation[T]](query: Seq[T]): Boolean = { + val strings = query.map(_.string) + val tokenized = strings.flatMap(tokenizer(_)) + val lemmatized = tokenized.map(lemmatizer.lemmatize(_)).toSeq + containsLemmatizedWords(lemmatized) + } + /** Tokenizes and lemmatizes query.string, then checks if the sequence is in the lexicon */ + def contains[T<:Observation[T]](query: T): Boolean = { + val tokenized = tokenizer(query.string).toSeq + val lemmatized = tokenized.map(lemmatizer.lemmatize(_)) + containsLemmatizedWords(lemmatized) + } + override def toString(): String = { "" } + + /** Return length of match, or -1 if no match. */ + def startsAt[T<:Observation[T]](query:T): Int = { + if (contains(query)){ + val tokenized = tokenizer(query.string).toSeq + val lemmatized = tokenized.map(lemmatizer.lemmatize(_)) + return wordTree.getSuffixIndex(lemmatized, true) + } + -1 + } +} + diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/PosessiveDeterminer.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/PosessiveDeterminer.scala new file mode 100644 index 0000000..4bf7422 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/PosessiveDeterminer.scala @@ -0,0 +1,14 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.app.nlp.lemma.LowercaseLemmatizer +import cc.factorie.app.strings.nonWhitespaceClassesSegmenter + +object PosessiveDeterminer extends PhraseLexicon("PosessiveDeterminer", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { + this ++= + """my +your +his +her +its +their""" +} diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/Preposition.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/Preposition.scala new file mode 100644 index 0000000..8c0ef69 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/Preposition.scala @@ -0,0 +1,53 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.app.nlp.lemma.LowercaseLemmatizer +import cc.factorie.app.strings.nonWhitespaceClassesSegmenter + +/** A non-exhaustive list of common English prepositions. 
*/ +object Preposition extends PhraseLexicon("Preposition", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { + this ++= + """about +above +across +after +against +around +at +before +behind +below +beneath +beside +besides +between +beyond +by +down +during +except +for +from +in +inside +into +like +near +of +off +on +out +outside +over +since +through +throughout +till +to +toward +under +until +up +upon +with +without""" +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/Pronoun.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/Pronoun.scala new file mode 100644 index 0000000..18106b0 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/Pronoun.scala @@ -0,0 +1,77 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.app.nlp.lemma.LowercaseLemmatizer +import cc.factorie.app.strings.nonWhitespaceClassesSegmenter + +object Pronoun extends PhraseLexicon("Pronoun", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { + this ++= + """all +another +any +anybody +anyone +anything +both +each +each other +either +everybody +everyone +everything +few +he +her +hers +herself +him +himself +his +I +it +its +itself +many +me +mine +myself +neither +no_one +nobody +none +nothing +one +one another +other +ours +ourselves +several +she +some +somebody +someone +something +such +that +theirs +them +themselves +these +they +this +those +us +we +what +whatever +which +whichever +who +whoever +whom +whomever +whose +you +yours +yourself +yourselves""" +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/ProvidedLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/ProvidedLexicon.scala new file mode 100644 index 0000000..8075195 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/ProvidedLexicon.scala @@ -0,0 +1,13 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.util.ModelProvider + +trait ProvidedLexicon[L] { + this: MutableLexicon => + + def provider:ModelProvider[L] + + synchronized { + this.++=(provider.provide) + } +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/ProvidedTriePhraseLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/ProvidedTriePhraseLexicon.scala new file mode 100644 index 0000000..04a1be6 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/ProvidedTriePhraseLexicon.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.util.ModelProvider + +import scala.reflect.ClassTag + +class ProvidedTriePhraseLexicon[L]()(implicit val provider:ModelProvider[L], ct:ClassTag[L]) extends TriePhraseLexicon(ct.runtimeClass.getName) with ProvidedLexicon[L] \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/StaticLexicons.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/StaticLexicons.scala index 2c34321..f2519b2 100644 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/StaticLexicons.scala +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/StaticLexicons.scala @@ -1,123 +1,6 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ package cc.factorie.app.nlp.lexicon -import java.net.URL -import java.nio.file.{Paths, Files, Path} - -import cc.factorie.app.nlp.lexicon.{iesl => Iesl, uscensus => Uscensus, wikipedia => Wikipedia, ssdi => Ssdi, mandarin => Mandarin} -import cc.factorie.app.strings.StringSegmenter -import cc.factorie.app.nlp.lemma.{Lemmatizer,LowercaseLemmatizer} -import java.io.{InputStream, File} -import cc.factorie.util.{ModelProvider, ClasspathURL} - -import scala.reflect.{ClassTag, classTag} -import scala.language.implicitConversions -import scala.util.Try - -trait LexiconsProvider { - def lexiconRoot:String - implicit def provide[L : ClassTag]:ModelProvider[L] -} - -object LexiconsProvider { - import cc.factorie.util.ISAble._ - - private def lexiconNamePieces[L:ClassTag]:Seq[String] = { - val arr = classTag[L].runtimeClass.getName.split("""\.""").map(_.stripSuffix("$")) - val fileName = arr.last.zipWithIndex.flatMap { - case (u, 0) => u.toLower.toString - case (u, _) if u.isUpper => "-" + u.toLower - case (l, _) => l.toString - }.mkString("") + ".txt" - arr.init.map(_.toLowerCase) ++ Seq(fileName) - } - - private def fullLexiconName[L:ClassTag] = lexiconNamePieces[L].mkString("/") - private def shortLexiconName[L:ClassTag] = lexiconNamePieces[L].drop(5).mkString("/") - - - def fromFile(f:File, useFullPath:Boolean = false):LexiconsProvider = new LexiconsProvider { - lazy val lexiconRoot = f.getAbsolutePath - override implicit def provide[L : ClassTag]: ModelProvider[L] = new ModelProvider[L] { - private val path = f.toPath.resolve(if(useFullPath) fullLexiconName[L] else shortLexiconName[L]) - val coordinates = path.toString - val provide:InputStream = buffered(path) - } - } - - def fromUrl(u:URL, useFullPath:Boolean = false):LexiconsProvider = new LexiconsProvider { - lazy val lexiconRoot = u.toString - implicit def provide[L:ClassTag]: ModelProvider[L] = new ModelProvider[L] { - private val modelUrl = new URL(u, if(useFullPath) fullLexiconName[L] else shortLexiconName[L]) - val provide: InputStream = buffered(modelUrl) - val coordinates: String = modelUrl.toString - } - } - - implicit def providePath(p:Path):LexiconsProvider = fromFile(p.toFile, false) - implicit def provideFile(f:File):LexiconsProvider = fromFile(f,false) - implicit def provideURL(u:URL):LexiconsProvider = fromUrl(u, false) - - def fromString(s:String, useFullPath:Boolean=false):LexiconsProvider = s match { - case cp if cp.toLowerCase == "classpath" => classpath(useFullPath) - case urlS if Try(new URL(urlS)).isSuccess => fromUrl(new URL(urlS), useFullPath) - case p => fromFile(new File(p), useFullPath) - } - - @deprecated("This exists to preserve legacy functionality", "10/27/15") - def classpath(useFullPath:Boolean=true):LexiconsProvider = new LexiconsProvider { - def lexiconRoot = "classpath" - implicit def provide[L: ClassTag]: ModelProvider[L] = new ModelProvider[L] { - private def url = if(useFullPath) ClasspathURL.fromDirectory[Lexicon](shortLexiconName[L]) else this.getClass.getResource("/" + shortLexiconName[L]) - def coordinates: String = url.toString - def provide: InputStream = url - } - } - - /* - @deprecated("This exists 
to preserve legacy functionality", "10/05/15") - def classpath:LexiconsProvider = new LexiconsProvider { - //lazy val lexiconRoot = ClasspathURL.fromDirectory[Lexicon]("") - lazy val lexiconRoot = Lexicon.getClass.getResource("") - implicit def provide[L : ClassTag]: ModelProvider[L] = new ModelProvider[L] { - private val url = { - println("root " + lexiconRoot) - println("shortname" + shortLexiconName[L]) - new URL(lexiconRoot, shortLexiconName[L]) - } - val coordinates: String = url.toString - val provide: InputStream = buffered(url) - } - } - */ -} - - - -trait ProvidedLexicon[L] { - this: MutableLexicon => - - def provider:ModelProvider[L] - - synchronized { - this.++=(provider.provide) - } -} - -class ProvidedTriePhraseLexicon[L]()(implicit val provider:ModelProvider[L], ct:ClassTag[L]) extends TriePhraseLexicon(ct.runtimeClass.getName) with ProvidedLexicon[L] - -class GenericLexicon(name:String, val provider:ModelProvider[GenericLexicon]) extends TriePhraseLexicon(name) with ProvidedLexicon[GenericLexicon] +import cc.factorie.app.nlp.lexicon.{iesl => Iesl, ssdi => Ssdi, uscensus => Uscensus, wikipedia => Wikipedia} class StaticLexicons()(implicit lp:LexiconsProvider) { @@ -126,25 +9,45 @@ class StaticLexicons()(implicit lp:LexiconsProvider) { object iesl { object Continents extends Iesl.Continents()(lp.provide[Iesl.Continents]) + object Country extends Iesl.Country()(lp.provide[Iesl.Country]) + object City extends Iesl.City()(lp.provide[Iesl.City]) + object UsState extends Iesl.UsState()(lp.provide[Iesl.UsState]) + object PlaceSuffix extends Iesl.PlaceSuffix()(lp.provide[Iesl.PlaceSuffix]) + object JobTitle extends Iesl.JobTitle()(lp.provide[Iesl.JobTitle]) + object Money extends Iesl.Money()(lp.provide[Iesl.Money]) + object Company extends Iesl.Company()(lp.provide[Iesl.Company]) + object OrgSuffix extends Iesl.OrgSuffix()(lp.provide[Iesl.OrgSuffix]) + object Month extends Iesl.Month()(lp.provide[Iesl.Month]) + object Day extends Iesl.Day()(lp.provide[Iesl.Day]) + object PersonHonorific extends Iesl.PersonHonorific()(lp.provide[Iesl.PersonHonorific]) + object PersonFirstHighest extends Iesl.PersonFirstHighest()(lp.provide[Iesl.PersonFirstHighest]) + object PersonFirstHigh extends Iesl.PersonFirstHigh()(lp.provide[Iesl.PersonFirstHigh]) + object PersonFirstMedium extends Iesl.PersonFirstMedium()(lp.provide[Iesl.PersonFirstMedium]) + object PersonLastHighest extends Iesl.PersonLastHighest()(lp.provide[Iesl.PersonLastHighest]) + object PersonLastHigh extends Iesl.PersonLastHigh()(lp.provide[Iesl.PersonLastHigh]) + object PersonLastMedium extends Iesl.PersonLastMedium()(lp.provide[Iesl.PersonLastMedium]) + object Say extends Iesl.Say()(lp.provide[Iesl.Say]) + object Demonym extends Iesl.Demonym()(lp.provide[Iesl.Demonym]) + object DemonymMap extends Iesl.DemonymMap()(lp.provide[Iesl.Demonym]) object AllPlaces extends TrieUnionLexicon("places", Continents, Country, City, UsState) @@ -155,20 +58,6 @@ class StaticLexicons()(implicit lp:LexiconsProvider) { } - object ssdi { - object PersonFirstHighest extends Ssdi.PersonFirstHighest()(lp.provide[Ssdi.PersonFirstHighest]) - object PersonFirstHigh extends Ssdi.PersonFirstHigh()(lp.provide[Ssdi.PersonFirstHigh]) - object PersonFirstMedium extends Ssdi.PersonFirstMedium()(lp.provide[Ssdi.PersonFirstMedium]) - object PersonLastHighest extends Ssdi.PersonLastHighest()(lp.provide[Ssdi.PersonLastHighest]) - object PersonLastHigh extends Ssdi.PersonLastHigh()(lp.provide[Ssdi.PersonLastHigh]) - object PersonLastMedium extends 
Ssdi.PersonLastMedium()(lp.provide[Ssdi.PersonLastMedium]) - - object PersonFirst extends TrieUnionLexicon("person-first", PersonFirstHighest, PersonFirstHigh, PersonFirstMedium) - - object PersonLast extends TrieUnionLexicon("person-last", PersonLastHighest, PersonLastHigh, PersonLastMedium) - - } - object uscensus { object PersonFirstFemale extends Uscensus.PersonFirstFemale()(lp.provide[Uscensus.PersonFirstFemale]) @@ -225,45 +114,18 @@ class StaticLexicons()(implicit lp:LexiconsProvider) { } - object mandarin { - object SurnamePinyin extends Mandarin.SurnamePinyin()(lp.provide[Mandarin.SurnamePinyin]) - object GivenNamePinyin extends Mandarin.GivenNamePinyin()(lp.provide[Mandarin.GivenNamePinyin]) + object ssdi { + object PersonFirstHighest extends Ssdi.PersonFirstHighest()(lp.provide[Ssdi.PersonFirstHighest]) + object PersonFirstHigh extends Ssdi.PersonFirstHigh()(lp.provide[Ssdi.PersonFirstHigh]) + object PersonFirstMedium extends Ssdi.PersonFirstMedium()(lp.provide[Ssdi.PersonFirstMedium]) + object PersonLastHighest extends Ssdi.PersonLastHighest()(lp.provide[Ssdi.PersonLastHighest]) + object PersonLastHigh extends Ssdi.PersonLastHigh()(lp.provide[Ssdi.PersonLastHigh]) + object PersonLastMedium extends Ssdi.PersonLastMedium()(lp.provide[Ssdi.PersonLastMedium]) + + object PersonFirst extends TrieUnionLexicon("person-first", PersonFirstHighest, PersonFirstHigh, PersonFirstMedium) + + object PersonLast extends TrieUnionLexicon("person-last", PersonLastHighest, PersonLastHigh, PersonLastMedium) + } - - object spanish { - - object Continents extends Iesl.es.Continents()(lp.provide[Iesl.es.Continents]) - object Day extends Iesl.es.Day()(lp.provide[Iesl.es.Day]) - object Month extends Iesl.es.Month()(lp.provide[Iesl.es.Month]) - object PersonFirst extends Iesl.es.PersonFirst()(lp.provide[Iesl.es.PersonFirst]) - object PersonLast extends Iesl.es.PersonLast()(lp.provide[Iesl.es.PersonLast]) - object Location extends Iesl.es.Location()(lp.provide[Iesl.es.Location]) - object Miscellaneous extends Iesl.es.Miscellaneous()(lp.provide[Iesl.es.Miscellaneous]) - object Person extends Iesl.es.Person()(lp.provide[Iesl.es.Person]) - object Organization extends Iesl.es.Organization()(lp.provide[Iesl.es.Organization]) - object PersonHonorific extends Iesl.es.PersonHonorific()(lp.provide[Iesl.es.PersonHonorific]) - object OrgSuffix extends Iesl.es.OrgSuffix()(lp.provide[Iesl.es.OrgSuffix]) - object Demonym extends Iesl.es.Demonym()(lp.provide[Iesl.es.Demonym]) - - - - object WikiBook extends Wikipedia.es.Book()(lp.provide[Wikipedia.es.Book]) - object WikiFilm extends Wikipedia.es.Film()(lp.provide[Wikipedia.es.Film]) - object WikiEvent extends Wikipedia.es.Event()(lp.provide[Wikipedia.es.Event]) - object WikiBusiness extends Wikipedia.es.Business()(lp.provide[Wikipedia.es.Business]) - - object WikiLocation extends Wikipedia.es.Location()(lp.provide[Wikipedia.es.Location]) - object WikiLocationRedirect extends Wikipedia.es.LocationRedirect()(lp.provide[Wikipedia.es.LocationRedirect]) - object WikiLocationAndRedirect extends TrieUnionLexicon("es-location-and-redirect", WikiLocation, WikiLocationRedirect) - - object WikiPerson extends Wikipedia.es.Person()(lp.provide[Wikipedia.es.Person]) - object WikiPersonRedirect extends Wikipedia.es.PersonRedirect()(lp.provide[Wikipedia.es.PersonRedirect]) - object WikiPersonAndRedirect extends TrieUnionLexicon("es-person-and-redirect", WikiPerson, WikiPersonRedirect) - - object WikiOrganization extends Wikipedia.es.Organization()(lp.provide[Wikipedia.es.Organization]) - 
object WikiOrganizationRedirect extends Wikipedia.es.OrganizationRedirect()(lp.provide[Wikipedia.es.OrganizationRedirect]) - object WikiOrganizationAndRedirect extends TrieUnionLexicon("es-organization-and-redirect", WikiOrganization, WikiOrganizationRedirect) - } - -} +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/StopWords.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/StopWords.scala index b116322..cc8df7d 100644 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/StopWords.scala +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/StopWords.scala @@ -11,23 +11,11 @@ See the License for the specific language governing permissions and limitations under the License. */ package cc.factorie.app.nlp.lexicon -import cc.factorie.app.nlp.lemma._ + +import cc.factorie.app.nlp.lemma.LowercaseLemmatizer import cc.factorie.app.strings._ -class CustomStopWords extends TriePhraseLexicon("CustomStopWords", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { - def this(filename: String) = { - this() - this ++= scala.io.Source.fromFile(filename) - } - def this(words: Seq[String]) = { - this() - words.foreach { w => this += w } - } -} -object CustomStopWords { - def apply(filename: String) = new CustomStopWords(filename) -} object StopWords extends TriePhraseLexicon("StopWords", nonWhitespaceClassesSegmenter, LowercaseLemmatizer) { def addFromFilename(filename: String): Unit = { diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/SuffixNode.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/SuffixNode.scala new file mode 100644 index 0000000..7f8d355 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/SuffixNode.scala @@ -0,0 +1,20 @@ +package cc.factorie.app.nlp.lexicon + +import scala.collection.mutable.HashMap + +class SuffixNode { + var endState: Boolean = false + val contents = new HashMap[String, SuffixNode] + def get(s: String): SuffixNode = { contents.getOrElse(s, null) } + def put(s: String, n: SuffixNode): Unit = { contents.put(s, n) } + def setEndState(b: Boolean): Unit = {endState = b} + def isEndState: Boolean = endState + def contains(s: String): Boolean = contents.contains(s) + override def toString(): String = { + var st = "" + contents.keys.foreach(k => { + st += s"[ $k ] --> ${contents(k).toString()} \n" + }) + st + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/SuffixTree.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/SuffixTree.scala index f981cfe..53ebc3f 100644 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/SuffixTree.scala +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/SuffixTree.scala @@ -12,8 +12,6 @@ limitations under the License. */ package cc.factorie.app.nlp.lexicon -import scala.collection.mutable.HashMap - /** * Created by kate on 5/20/14. 
 */
@@ -75,19 +73,3 @@
   }
 }
 
-class SuffixNode {
-  var endState: Boolean = false
-  val contents = new HashMap[String, SuffixNode]
-  def get(s: String): SuffixNode = { contents.getOrElse(s, null) }
-  def put(s: String, n: SuffixNode): Unit = { contents.put(s, n) }
-  def setEndState(b: Boolean): Unit = {endState = b}
-  def isEndState: Boolean = endState
-  def contains(s: String): Boolean = contents.contains(s)
-  override def toString(): String = {
-    var st = ""
-    contents.keys.foreach(k => {
-      st += s"[ $k ] --> ${contents(k).toString()} \n"
-    })
-    st
-  }
-}
\ No newline at end of file
diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/TriePhraseLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/TriePhraseLexicon.scala
new file mode 100644
index 0000000..8e2c5d0
--- /dev/null
+++ b/src/main/scala/cc/factorie/app/nlp/lexicon/TriePhraseLexicon.scala
@@ -0,0 +1,73 @@
+package cc.factorie.app.nlp.lexicon
+
+import cc.factorie.app.chain.Observation
+import cc.factorie.app.nlp.Token
+import cc.factorie.app.nlp.lemma.{Lemmatizer, LowercaseLemmatizer}
+import cc.factorie.app.strings.StringSegmenter
+import cc.factorie.variable.CategoricalVectorVar
+
+import scala.io.Source
+
+
+/**
+ * A phrase lexicon based on Aho-Corasick Trie lookups.
+ * Use the tag text methods in preference to the other methods, which are preserved for compatibility.
+ * The other methods have the same semantics as the PhraseLexicon, which return true iff the whole string is in the lexicon.
+ */
+class TriePhraseLexicon(val name: String, val tokenizer: StringSegmenter = cc.factorie.app.strings.nonWhitespaceSegmenter, val lemmatizer: Lemmatizer = LowercaseLemmatizer, val sep: String = " ") extends MutableLexicon {
+  val trie = new AhoCorasick(sep)
+
+  def +=(phrase:String): Unit = synchronized {
+    val words: Seq[String] = tokenizer(phrase).toSeq
+    trie += words.map(lemmatizer.lemmatize)
+  }
+
+  /** Add all lines from the input Source to this lexicon. Source is assumed to contain multiple newline-separated lexicon entries.
+    * Overridden to call setTransitions after reading the file.
+    */
+  override def ++=(source:Source): this.type = synchronized { for (line <- source.getLines()) { val phrase = line.trim; if (phrase.length > 0) TriePhraseLexicon.this.+=(phrase) }; trie.setTransitions(); source.close(); this }
+
+  def setTransitions() : Unit = synchronized { trie.setTransitions() }
+
+  /** Checks whether the lexicon contains this already-lemmatized/tokenized single word */
+  def containsLemmatizedWord(word: String): Boolean = { containsLemmatizedWords(List(word).toSeq) }
+
+  /** Checks whether the lexicon contains this already-lemmatized/tokenized phrase, where 'words' can either be
+    * single word or a multi-word expression.
*/ + def containsLemmatizedWords(words: Seq[String]): Boolean = { + trie.findExactMention(words) + } + + /** Tokenizes and lemmatizes the string of each entry in 'query', then checks if the exact sequence is in the lexicon*/ + def contains[T<:Observation[T]](query: Seq[T]): Boolean = { + val strings = query.map(_.string) + val tokenized = strings.flatMap(tokenizer(_)) + val lemmatized = tokenized.map(lemmatizer.lemmatize(_)).toSeq + containsLemmatizedWords(lemmatized) + } + + /** Tokenizes and lemmatizes query.string, then checks if the exact sequence is in the lexicon */ + def contains[T<:Observation[T]](query: T): Boolean = { + val tokenized = tokenizer(query.string).toSeq + val lemmatized = tokenized.map(lemmatizer.lemmatize(_)) + containsLemmatizedWords(lemmatized) + } + + override def toString(): String = { "" } + + /** Tags each token with the specified tag, if it is present in the lexicon */ + def tagLemmatizedText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String) : Unit = { + trie.tagMentions(tokens,featureFunc,tag) + } + + /** Tags each token with the specified tag, if the lemmatized form is present in the lexicon */ + def tagText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String) : Unit = { + trie.lemmatizeAndTagMentions(tokens,featureFunc,tag,lemmatizer) + } + + /** Tags each token with the specified tag, if the lemmatized form is present in the lexicon */ + def tagText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String, lemmaFunc : (Token => String)) : Unit = { + trie.tagMentions(tokens,featureFunc,tag,lemmaFunc) + } +} + diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/TrieUnionLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/TrieUnionLexicon.scala new file mode 100644 index 0000000..8c4130b --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/TrieUnionLexicon.scala @@ -0,0 +1,41 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.app.chain.Observation +import cc.factorie.app.nlp.Token +import cc.factorie.app.nlp.lemma.Lemmatizer +import cc.factorie.app.strings.StringSegmenter +import cc.factorie.variable.CategoricalVectorVar + + +/** + * A union lexicon of multiple TriePhraseLexicons. + * Has similar semantics to the TriePhraseLexicon. + */ +class TrieUnionLexicon[L <: TriePhraseLexicon](val name: String, val members: L*) extends MutableLexicon { + def tokenizer: StringSegmenter = members.head.tokenizer + def lemmatizer: Lemmatizer = members.head.lemmatizer + def containsLemmatizedWord(word: String): Boolean = members.exists(_.containsLemmatizedWord(word)) + def containsLemmatizedWords(word: Seq[String]): Boolean = members.exists(_.containsLemmatizedWords(word)) + def contains[T<:Observation[T]](query: T): Boolean = members.exists(_.contains(query)) + def contains[T<:Observation[T]](query: Seq[T]): Boolean = members.exists(_.contains(query)) + def +=(s:String): Unit = {throw new Error("TrieUnionLexicon is immutable. 
Append to the appropriate TriePhraseLexicon.")} + override def toString(): String = { + var st = "UNION { " + members.foreach(st += _.toString()+" , ") + st += " } " + st + } + + def tagLemmatizedText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String) : Unit = { + members.map(_.tagLemmatizedText(tokens,featureFunc,tag)) + } + + def tagText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String) : Unit = { + members.map(_.tagText(tokens,featureFunc,tag)) + } + + /** Tags each token with the specified tag, if the lemmatized form is present in the lexicon */ + def tagText(tokens : Seq[Token], featureFunc : (Token => CategoricalVectorVar[String]), tag : String, lemmaFunc : (Token => String)) : Unit = { + members.map(_.tagText(tokens,featureFunc,tag,lemmaFunc)) + } +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/UnionLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/UnionLexicon.scala new file mode 100644 index 0000000..ba9fb69 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/UnionLexicon.scala @@ -0,0 +1,25 @@ +package cc.factorie.app.nlp.lexicon + +import cc.factorie.app.chain.Observation +import cc.factorie.app.nlp.lemma.Lemmatizer +import cc.factorie.app.strings.StringSegmenter + +/** a union of many PhraseLexicons + * + * @author Kate Silverstein */ +class UnionLexicon(val name: String, val members: PhraseLexicon*) extends MutableLexicon { + def tokenizer: StringSegmenter = members.head.tokenizer + def lemmatizer: Lemmatizer = members.head.lemmatizer + def containsLemmatizedWord(word: String): Boolean = members.exists(_.containsLemmatizedWord(word)) + def containsLemmatizedWords(word: Seq[String]): Boolean = members.exists(_.containsLemmatizedWords(word)) + def contains[T<:Observation[T]](query: T): Boolean = members.exists(_.contains(query)) + def contains[T<:Observation[T]](query: Seq[T]): Boolean = members.exists(_.contains(query)) + def +=(s:String): Unit = {throw new Error("method not implemented for UnionLexicon")} + override def toString: String = { + var st = "UNION { " + members.foreach(st += _.toString()+" , ") + st += " } " + st + } +} + diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/iesl/IeslLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/iesl/IeslLexicon.scala index 018b363..9064657 100644 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/iesl/IeslLexicon.scala +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/iesl/IeslLexicon.scala @@ -12,45 +12,68 @@ limitations under the License. 
*/ package cc.factorie.app.nlp.lexicon.iesl -import cc.factorie.app.nlp.lexicon.{TriePhraseLexicon, ProvidedTriePhraseLexicon} +import cc.factorie.app.nlp.lexicon.{ProvidedTriePhraseLexicon, TriePhraseLexicon} import cc.factorie.util.ModelProvider +import scala.io import scala.io.Source -/** - * @author johnsullivan - */ -class Continents()(implicit mp: ModelProvider[Continents]) extends ProvidedTriePhraseLexicon[Continents] -class Country()(implicit mp: ModelProvider[Country]) extends ProvidedTriePhraseLexicon[Country] -class City()(implicit mp: ModelProvider[City]) extends ProvidedTriePhraseLexicon[City] -class UsState()(implicit mp: ModelProvider[UsState]) extends ProvidedTriePhraseLexicon[UsState] -class PlaceSuffix()(implicit mp: ModelProvider[PlaceSuffix]) extends ProvidedTriePhraseLexicon[PlaceSuffix] -class JobTitle()(implicit mp: ModelProvider[JobTitle]) extends ProvidedTriePhraseLexicon[JobTitle] -class Money()(implicit mp: ModelProvider[Money]) extends ProvidedTriePhraseLexicon[Money] -class Company()(implicit mp: ModelProvider[Company]) extends ProvidedTriePhraseLexicon[Company] -class OrgSuffix()(implicit mp: ModelProvider[OrgSuffix]) extends ProvidedTriePhraseLexicon[OrgSuffix] -class Month()(implicit mp: ModelProvider[Month]) extends ProvidedTriePhraseLexicon[Month] -class Day()(implicit mp: ModelProvider[Day]) extends ProvidedTriePhraseLexicon[Day] -class PersonHonorific()(implicit mp: ModelProvider[PersonHonorific]) extends ProvidedTriePhraseLexicon[PersonHonorific] -class PersonFirstHighest()(implicit mp: ModelProvider[PersonFirstHighest]) extends ProvidedTriePhraseLexicon[PersonFirstHighest] -class PersonFirstHigh()(implicit mp: ModelProvider[PersonFirstHigh]) extends ProvidedTriePhraseLexicon[PersonFirstHigh] -class PersonFirstMedium()(implicit mp: ModelProvider[PersonFirstMedium]) extends ProvidedTriePhraseLexicon[PersonFirstMedium] -class PersonLastHighest()(implicit mp: ModelProvider[PersonLastHighest]) extends ProvidedTriePhraseLexicon[PersonLastHighest] -class PersonLastHigh()(implicit mp: ModelProvider[PersonLastHigh]) extends ProvidedTriePhraseLexicon[PersonLastHigh] -class PersonLastMedium()(implicit mp: ModelProvider[PersonLastMedium]) extends ProvidedTriePhraseLexicon[PersonLastMedium] -class Say()(implicit mp: ModelProvider[Say]) extends ProvidedTriePhraseLexicon[Say] -class Demonym()(implicit mp: ModelProvider[Demonym]) extends TriePhraseLexicon(classOf[Demonym].getName) { - synchronized { - Source.fromInputStream(mp.provide)(io.Codec.UTF8).getLines().flatMap(_.trim.split("\t")).foreach(this.+=) + + + /** + * @author johnsullivan + */ + class Continents()(implicit mp: ModelProvider[Continents]) extends ProvidedTriePhraseLexicon[Continents] + + class Country()(implicit mp: ModelProvider[Country]) extends ProvidedTriePhraseLexicon[Country] + + class City()(implicit mp: ModelProvider[City]) extends ProvidedTriePhraseLexicon[City] + + class UsState()(implicit mp: ModelProvider[UsState]) extends ProvidedTriePhraseLexicon[UsState] + + class PlaceSuffix()(implicit mp: ModelProvider[PlaceSuffix]) extends ProvidedTriePhraseLexicon[PlaceSuffix] + + class JobTitle()(implicit mp: ModelProvider[JobTitle]) extends ProvidedTriePhraseLexicon[JobTitle] + + class Money()(implicit mp: ModelProvider[Money]) extends ProvidedTriePhraseLexicon[Money] + + class Company()(implicit mp: ModelProvider[Company]) extends ProvidedTriePhraseLexicon[Company] + + class OrgSuffix()(implicit mp: ModelProvider[OrgSuffix]) extends ProvidedTriePhraseLexicon[OrgSuffix] + + class Month()(implicit 
mp: ModelProvider[Month]) extends ProvidedTriePhraseLexicon[Month] + + class Day()(implicit mp: ModelProvider[Day]) extends ProvidedTriePhraseLexicon[Day] + + class PersonHonorific()(implicit mp: ModelProvider[PersonHonorific]) extends ProvidedTriePhraseLexicon[PersonHonorific] + + class PersonFirstHighest()(implicit mp: ModelProvider[PersonFirstHighest]) extends ProvidedTriePhraseLexicon[PersonFirstHighest] + + class PersonFirstHigh()(implicit mp: ModelProvider[PersonFirstHigh]) extends ProvidedTriePhraseLexicon[PersonFirstHigh] + + class PersonFirstMedium()(implicit mp: ModelProvider[PersonFirstMedium]) extends ProvidedTriePhraseLexicon[PersonFirstMedium] + + class PersonLastHighest()(implicit mp: ModelProvider[PersonLastHighest]) extends ProvidedTriePhraseLexicon[PersonLastHighest] + + class PersonLastHigh()(implicit mp: ModelProvider[PersonLastHigh]) extends ProvidedTriePhraseLexicon[PersonLastHigh] + + class PersonLastMedium()(implicit mp: ModelProvider[PersonLastMedium]) extends ProvidedTriePhraseLexicon[PersonLastMedium] + + class Say()(implicit mp: ModelProvider[Say]) extends ProvidedTriePhraseLexicon[Say] + + class Demonym()(implicit mp: ModelProvider[Demonym]) extends TriePhraseLexicon(classOf[Demonym].getName) { + synchronized { + Source.fromInputStream(mp.provide)(io.Codec.UTF8).getLines().flatMap(_.trim.split("\t")).foreach(this.+=) + } } -} - -class DemonymMap()(implicit mp:ModelProvider[Demonym]) extends scala.collection.mutable.HashMap[String,String] { - synchronized { - Source.fromInputStream(mp.provide)(io.Codec.UTF8).getLines().foreach { line => - val entries = line.trim.split("\t") - val value = entries.head - entries.foreach(e => this.update(e, value)) + + class DemonymMap()(implicit mp: ModelProvider[Demonym]) extends scala.collection.mutable.HashMap[String, String] { + synchronized { + Source.fromInputStream(mp.provide)(io.Codec.UTF8).getLines().foreach { line => + val entries = line.trim.split("\t") + val value = entries.head + entries.foreach(e => this.update(e, value)) + } } } -} + diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/iesl/es/IeslSpanishLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/iesl/es/IeslSpanishLexicon.scala deleted file mode 100644 index 897c996..0000000 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/iesl/es/IeslSpanishLexicon.scala +++ /dev/null @@ -1,20 +0,0 @@ -package cc.factorie.app.nlp.lexicon.iesl.es - -import cc.factorie.app.nlp.lexicon.{TriePhraseLexicon, ProvidedTriePhraseLexicon} -import cc.factorie.util.ModelProvider - -import scala.io.Source - -class Continents()(implicit mp: ModelProvider[Continents]) extends ProvidedTriePhraseLexicon[Continents] -class Day()(implicit mp: ModelProvider[Day]) extends ProvidedTriePhraseLexicon[Day] -class Month()(implicit mp: ModelProvider[Month]) extends ProvidedTriePhraseLexicon[Month] -class PersonFirst()(implicit mp: ModelProvider[PersonFirst]) extends ProvidedTriePhraseLexicon[PersonFirst] -class PersonLast()(implicit mp: ModelProvider[PersonLast]) extends ProvidedTriePhraseLexicon[PersonLast] -class Location()(implicit mp: ModelProvider[Location]) extends ProvidedTriePhraseLexicon[Location] -class Miscellaneous()(implicit mp: ModelProvider[Miscellaneous]) extends ProvidedTriePhraseLexicon[Miscellaneous] -class Person()(implicit mp: ModelProvider[Person]) extends ProvidedTriePhraseLexicon[Person] -class PersonHonorific()(implicit mp: ModelProvider[PersonHonorific]) extends ProvidedTriePhraseLexicon[PersonHonorific] -class Organization()(implicit mp: 
ModelProvider[Organization]) extends ProvidedTriePhraseLexicon[Organization] -class OrgSuffix()(implicit mp: ModelProvider[OrgSuffix]) extends ProvidedTriePhraseLexicon[OrgSuffix] -class Demonym()(implicit mp: ModelProvider[Demonym]) extends ProvidedTriePhraseLexicon[Demonym] - diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/mandarin/MandarinLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/mandarin/MandarinLexicon.scala deleted file mode 100644 index d420a4f..0000000 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/mandarin/MandarinLexicon.scala +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.lexicon.mandarin - -import cc.factorie.app.nlp.lexicon.ProvidedTriePhraseLexicon -import cc.factorie.util.ModelProvider - -/** - * @author johnsullivan - */ -class SurnamePinyin()(implicit mp:ModelProvider[SurnamePinyin]) extends ProvidedTriePhraseLexicon[SurnamePinyin] -class GivenNamePinyin()(implicit mp:ModelProvider[GivenNamePinyin]) extends ProvidedTriePhraseLexicon[GivenNamePinyin] diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/package.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/package.scala deleted file mode 100644 index a20ce40..0000000 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/package.scala +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp - -package object lexicon { - /* - val iesl = ClasspathResourceLexicons.iesl - val uscensus = ClasspathResourceLexicons.uscensus - val wikipedia = ClasspathResourceLexicons.wikipedia - val ssdi = ClasspathResourceLexicons.ssdi - val mandarin = ClasspathResourceLexicons.mandarin - */ - -} diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/ssdi/SsdiLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/ssdi/SsdiLexicon.scala index 97325c2..30fc082 100644 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/ssdi/SsdiLexicon.scala +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/ssdi/SsdiLexicon.scala @@ -18,10 +18,17 @@ import cc.factorie.util.ModelProvider /** * @author johnsullivan */ -class PersonFirstHighest()(implicit mp: ModelProvider[PersonFirstHighest]) extends ProvidedTriePhraseLexicon[PersonFirstHighest] -class PersonFirstHigh()(implicit mp: ModelProvider[PersonFirstHigh]) extends ProvidedTriePhraseLexicon[PersonFirstHigh] -class PersonFirstMedium()(implicit mp: ModelProvider[PersonFirstMedium]) extends ProvidedTriePhraseLexicon[PersonFirstMedium] -class PersonLastHighest()(implicit mp: ModelProvider[PersonLastHighest]) extends ProvidedTriePhraseLexicon[PersonLastHighest] -class PersonLastHigh()(implicit mp: ModelProvider[PersonLastHigh]) extends ProvidedTriePhraseLexicon[PersonLastHigh] -class PersonLastMedium()(implicit mp: ModelProvider[PersonLastMedium]) extends ProvidedTriePhraseLexicon[PersonLastMedium] + + + class PersonFirstHighest()(implicit mp: ModelProvider[PersonFirstHighest]) extends ProvidedTriePhraseLexicon[PersonFirstHighest] + + class PersonFirstHigh()(implicit mp: ModelProvider[PersonFirstHigh]) extends ProvidedTriePhraseLexicon[PersonFirstHigh] + + class PersonFirstMedium()(implicit mp: ModelProvider[PersonFirstMedium]) extends ProvidedTriePhraseLexicon[PersonFirstMedium] + + class PersonLastHighest()(implicit mp: ModelProvider[PersonLastHighest]) extends ProvidedTriePhraseLexicon[PersonLastHighest] + + class PersonLastHigh()(implicit mp: ModelProvider[PersonLastHigh]) extends ProvidedTriePhraseLexicon[PersonLastHigh] + + class PersonLastMedium()(implicit mp: ModelProvider[PersonLastMedium]) extends ProvidedTriePhraseLexicon[PersonLastMedium] diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/uscensus/UscensusLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/uscensus/UscensusLexicon.scala index 16fb5f4..45b73d3 100644 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/uscensus/UscensusLexicon.scala +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/uscensus/UscensusLexicon.scala @@ -19,6 +19,10 @@ import cc.factorie.util.ModelProvider * @author johnsullivan */ -class PersonFirstFemale()(implicit mp :ModelProvider[PersonFirstFemale]) extends ProvidedTriePhraseLexicon[PersonFirstFemale] -class PersonFirstMale()(implicit mp :ModelProvider[PersonFirstMale]) extends ProvidedTriePhraseLexicon[PersonFirstMale] -class PersonLast()(implicit mp :ModelProvider[PersonLast]) extends ProvidedTriePhraseLexicon[PersonLast] + + class PersonFirstFemale()(implicit mp: ModelProvider[PersonFirstFemale]) extends ProvidedTriePhraseLexicon[PersonFirstFemale] + + class PersonFirstMale()(implicit mp: ModelProvider[PersonFirstMale]) extends ProvidedTriePhraseLexicon[PersonFirstMale] + + class PersonLast()(implicit mp: ModelProvider[PersonLast]) extends ProvidedTriePhraseLexicon[PersonLast] + diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/wikipedia/WikipediaLexicon.scala 
b/src/main/scala/cc/factorie/app/nlp/lexicon/wikipedia/WikipediaLexicon.scala index 758b2d9..5d83aba 100644 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/wikipedia/WikipediaLexicon.scala +++ b/src/main/scala/cc/factorie/app/nlp/lexicon/wikipedia/WikipediaLexicon.scala @@ -18,36 +18,72 @@ import cc.factorie.util.ModelProvider /** * @author johnsullivan */ -class Battle()(implicit mp:ModelProvider[Battle]) extends ProvidedTriePhraseLexicon[Battle] -class BattleRedirect()(implicit mp:ModelProvider[BattleRedirect]) extends ProvidedTriePhraseLexicon[BattleRedirect] -class BattleDisambiguation()(implicit mp:ModelProvider[BattleDisambiguation]) extends ProvidedTriePhraseLexicon[BattleDisambiguation] -class Book()(implicit mp:ModelProvider[Book]) extends ProvidedTriePhraseLexicon[Book] -class BookRedirect()(implicit mp:ModelProvider[BookRedirect]) extends ProvidedTriePhraseLexicon[BookRedirect] -class BookDisambiguation()(implicit mp:ModelProvider[BookDisambiguation]) extends ProvidedTriePhraseLexicon[BookDisambiguation] -class Business()(implicit mp:ModelProvider[Business]) extends ProvidedTriePhraseLexicon[Business] -class BusinessRedirect()(implicit mp:ModelProvider[BusinessRedirect]) extends ProvidedTriePhraseLexicon[BusinessRedirect] -class BusinessDisambiguation()(implicit mp:ModelProvider[BusinessDisambiguation]) extends ProvidedTriePhraseLexicon[BusinessDisambiguation] -class Competition()(implicit mp:ModelProvider[Competition]) extends ProvidedTriePhraseLexicon[Competition] -class CompetitionRedirect()(implicit mp:ModelProvider[CompetitionRedirect]) extends ProvidedTriePhraseLexicon[CompetitionRedirect] -class CompetitionDisambiguation()(implicit mp:ModelProvider[CompetitionDisambiguation]) extends ProvidedTriePhraseLexicon[CompetitionDisambiguation] -class Event()(implicit mp:ModelProvider[Event]) extends ProvidedTriePhraseLexicon[Event] -class EventRedirect()(implicit mp:ModelProvider[EventRedirect]) extends ProvidedTriePhraseLexicon[EventRedirect] -class EventDisambiguation()(implicit mp:ModelProvider[EventDisambiguation]) extends ProvidedTriePhraseLexicon[EventDisambiguation] -class Film()(implicit mp:ModelProvider[Film]) extends ProvidedTriePhraseLexicon[Film] -class FilmRedirect()(implicit mp:ModelProvider[FilmRedirect]) extends ProvidedTriePhraseLexicon[FilmRedirect] -class FilmDisambiguation()(implicit mp:ModelProvider[FilmDisambiguation]) extends ProvidedTriePhraseLexicon[FilmDisambiguation] -class Location()(implicit mp:ModelProvider[Location]) extends ProvidedTriePhraseLexicon[Location] -class LocationRedirect()(implicit mp:ModelProvider[LocationRedirect]) extends ProvidedTriePhraseLexicon[LocationRedirect] -class LocationDisambiguation()(implicit mp:ModelProvider[LocationDisambiguation]) extends ProvidedTriePhraseLexicon[LocationDisambiguation] -class ManMadeThing()(implicit mp:ModelProvider[ManMadeThing]) extends ProvidedTriePhraseLexicon[ManMadeThing] -class ManMadeThingRedirect()(implicit mp:ModelProvider[ManMadeThingRedirect]) extends ProvidedTriePhraseLexicon[ManMadeThingRedirect] -class ManMadeThingDisambiguation()(implicit mp:ModelProvider[ManMadeThingDisambiguation]) extends ProvidedTriePhraseLexicon[ManMadeThingDisambiguation] -class Organization()(implicit mp:ModelProvider[Organization]) extends ProvidedTriePhraseLexicon[Organization] -class OrganizationRedirect()(implicit mp:ModelProvider[OrganizationRedirect]) extends ProvidedTriePhraseLexicon[OrganizationRedirect] -class OrganizationDisambiguation()(implicit mp:ModelProvider[OrganizationDisambiguation]) extends 
ProvidedTriePhraseLexicon[OrganizationDisambiguation] -class Person()(implicit mp:ModelProvider[Person]) extends ProvidedTriePhraseLexicon[Person] -class PersonRedirect()(implicit mp:ModelProvider[PersonRedirect]) extends ProvidedTriePhraseLexicon[PersonRedirect] -class PersonDisambiguation()(implicit mp:ModelProvider[PersonDisambiguation]) extends ProvidedTriePhraseLexicon[PersonDisambiguation] -class Song()(implicit mp:ModelProvider[Song]) extends ProvidedTriePhraseLexicon[Song] -class SongRedirect()(implicit mp:ModelProvider[SongRedirect]) extends ProvidedTriePhraseLexicon[SongRedirect] -class SongDisambiguation()(implicit mp:ModelProvider[SongDisambiguation]) extends ProvidedTriePhraseLexicon[SongDisambiguation] + + + + class Battle()(implicit mp: ModelProvider[Battle]) extends ProvidedTriePhraseLexicon[Battle] + + class BattleRedirect()(implicit mp: ModelProvider[BattleRedirect]) extends ProvidedTriePhraseLexicon[BattleRedirect] + + class BattleDisambiguation()(implicit mp: ModelProvider[BattleDisambiguation]) extends ProvidedTriePhraseLexicon[BattleDisambiguation] + + class Book()(implicit mp: ModelProvider[Book]) extends ProvidedTriePhraseLexicon[Book] + + class BookRedirect()(implicit mp: ModelProvider[BookRedirect]) extends ProvidedTriePhraseLexicon[BookRedirect] + + class BookDisambiguation()(implicit mp: ModelProvider[BookDisambiguation]) extends ProvidedTriePhraseLexicon[BookDisambiguation] + + class Business()(implicit mp: ModelProvider[Business]) extends ProvidedTriePhraseLexicon[Business] + + class BusinessRedirect()(implicit mp: ModelProvider[BusinessRedirect]) extends ProvidedTriePhraseLexicon[BusinessRedirect] + + class BusinessDisambiguation()(implicit mp: ModelProvider[BusinessDisambiguation]) extends ProvidedTriePhraseLexicon[BusinessDisambiguation] + + class Competition()(implicit mp: ModelProvider[Competition]) extends ProvidedTriePhraseLexicon[Competition] + + class CompetitionRedirect()(implicit mp: ModelProvider[CompetitionRedirect]) extends ProvidedTriePhraseLexicon[CompetitionRedirect] + + class CompetitionDisambiguation()(implicit mp: ModelProvider[CompetitionDisambiguation]) extends ProvidedTriePhraseLexicon[CompetitionDisambiguation] + + class Event()(implicit mp: ModelProvider[Event]) extends ProvidedTriePhraseLexicon[Event] + + class EventRedirect()(implicit mp: ModelProvider[EventRedirect]) extends ProvidedTriePhraseLexicon[EventRedirect] + + class EventDisambiguation()(implicit mp: ModelProvider[EventDisambiguation]) extends ProvidedTriePhraseLexicon[EventDisambiguation] + + class Film()(implicit mp: ModelProvider[Film]) extends ProvidedTriePhraseLexicon[Film] + + class FilmRedirect()(implicit mp: ModelProvider[FilmRedirect]) extends ProvidedTriePhraseLexicon[FilmRedirect] + + class FilmDisambiguation()(implicit mp: ModelProvider[FilmDisambiguation]) extends ProvidedTriePhraseLexicon[FilmDisambiguation] + + class Location()(implicit mp: ModelProvider[Location]) extends ProvidedTriePhraseLexicon[Location] + + class LocationRedirect()(implicit mp: ModelProvider[LocationRedirect]) extends ProvidedTriePhraseLexicon[LocationRedirect] + + class LocationDisambiguation()(implicit mp: ModelProvider[LocationDisambiguation]) extends ProvidedTriePhraseLexicon[LocationDisambiguation] + + class ManMadeThing()(implicit mp: ModelProvider[ManMadeThing]) extends ProvidedTriePhraseLexicon[ManMadeThing] + + class ManMadeThingRedirect()(implicit mp: ModelProvider[ManMadeThingRedirect]) extends ProvidedTriePhraseLexicon[ManMadeThingRedirect] + + class 
ManMadeThingDisambiguation()(implicit mp: ModelProvider[ManMadeThingDisambiguation]) extends ProvidedTriePhraseLexicon[ManMadeThingDisambiguation] + + class Organization()(implicit mp: ModelProvider[Organization]) extends ProvidedTriePhraseLexicon[Organization] + + class OrganizationRedirect()(implicit mp: ModelProvider[OrganizationRedirect]) extends ProvidedTriePhraseLexicon[OrganizationRedirect] + + class OrganizationDisambiguation()(implicit mp: ModelProvider[OrganizationDisambiguation]) extends ProvidedTriePhraseLexicon[OrganizationDisambiguation] + + class Person()(implicit mp: ModelProvider[Person]) extends ProvidedTriePhraseLexicon[Person] + + class PersonRedirect()(implicit mp: ModelProvider[PersonRedirect]) extends ProvidedTriePhraseLexicon[PersonRedirect] + + class PersonDisambiguation()(implicit mp: ModelProvider[PersonDisambiguation]) extends ProvidedTriePhraseLexicon[PersonDisambiguation] + + class Song()(implicit mp: ModelProvider[Song]) extends ProvidedTriePhraseLexicon[Song] + + class SongRedirect()(implicit mp: ModelProvider[SongRedirect]) extends ProvidedTriePhraseLexicon[SongRedirect] + + class SongDisambiguation()(implicit mp: ModelProvider[SongDisambiguation]) extends ProvidedTriePhraseLexicon[SongDisambiguation] + diff --git a/src/main/scala/cc/factorie/app/nlp/lexicon/wikipedia/es/WikipediaLexicon.scala b/src/main/scala/cc/factorie/app/nlp/lexicon/wikipedia/es/WikipediaLexicon.scala deleted file mode 100644 index c414bfb..0000000 --- a/src/main/scala/cc/factorie/app/nlp/lexicon/wikipedia/es/WikipediaLexicon.scala +++ /dev/null @@ -1,16 +0,0 @@ -package cc.factorie.app.nlp.lexicon.wikipedia.es - -import cc.factorie.app.nlp.lexicon.ProvidedTriePhraseLexicon -import cc.factorie.util.ModelProvider - -class Book()(implicit mp:ModelProvider[Book]) extends ProvidedTriePhraseLexicon[Book] -class Film()(implicit mp:ModelProvider[Film]) extends ProvidedTriePhraseLexicon[Film] -class Event()(implicit mp:ModelProvider[Event]) extends ProvidedTriePhraseLexicon[Event] -class Business()(implicit mp:ModelProvider[Business]) extends ProvidedTriePhraseLexicon[Business] -class Location()(implicit mp:ModelProvider[Location]) extends ProvidedTriePhraseLexicon[Location] -class LocationRedirect()(implicit mp:ModelProvider[LocationRedirect]) extends ProvidedTriePhraseLexicon[LocationRedirect] -class Organization()(implicit mp:ModelProvider[Organization]) extends ProvidedTriePhraseLexicon[Organization] -class OrganizationRedirect()(implicit mp:ModelProvider[OrganizationRedirect]) extends ProvidedTriePhraseLexicon[OrganizationRedirect] -class Person()(implicit mp:ModelProvider[Person]) extends ProvidedTriePhraseLexicon[Person] -class PersonRedirect()(implicit mp:ModelProvider[PersonRedirect]) extends ProvidedTriePhraseLexicon[PersonRedirect] - diff --git a/src/main/scala/cc/factorie/app/nlp/load/Load.scala b/src/main/scala/cc/factorie/app/nlp/load/Load.scala index df79a91..f2583c4 100644 --- a/src/main/scala/cc/factorie/app/nlp/load/Load.scala +++ b/src/main/scala/cc/factorie/app/nlp/load/Load.scala @@ -12,7 +12,8 @@ limitations under the License. 
*/ package cc.factorie.app.nlp.load -import cc.factorie.app.nlp._ + +import cc.factorie.app.nlp.Document import cc.factorie.util.ISAble import scala.io.Source @@ -29,9 +30,3 @@ trait Load { def fromFilename(filename:String, encoding:String = "UTF-8"): Seq[Document] = fromFile(new java.io.File(filename), encoding) def fromISAble[A](a:A)(implicit conv:ISAble[A]) = fromStream(conv(a)) } - -/** The interface common to objects that create Documents from the files in a directory. - @author Andrew McCallum */ -trait LoadDirectory { - def fromDirectory(dir:java.io.File): Seq[Document] -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadACE.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadACE.scala deleted file mode 100644 index 9854c31..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadACE.scala +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -/* -package cc.factorie.app.nlp.load -import java.io.File - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.coref.Mention -import cc.factorie.app.nlp.phrase.Phrase -import cc.factorie.app.nlp.segment._ - -import scala.xml.{NodeSeq, XML} - -// TODO: consider moving this info into variables. 
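Aside: the Load trait kept above retains fromSource/fromFilename as the common loader entry points. A minimal usage sketch (illustration only, not part of the patch), assuming a CoNLL-2003 formatted file at a hypothetical path:

  import cc.factorie.app.nlp.Document
  import cc.factorie.app.nlp.load.LoadConll2003

  // fromFilename defaults to UTF-8 and delegates to fromFile/fromSource.
  val docs: Seq[Document] = LoadConll2003(BILOU = true).fromFilename("/data/conll2003/eng.train")  // path is hypothetical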
-case class ACEEntityIdentifiers(eId: String, eType: String, eSubtype: String, eClass: String) - -case class ACEMentionIdentifiers(mId: String, mType: String, ldcType: String, offsetStart: Int, offsetEnd: Int) - -case class ACERelationIdentifiers(rId: String, rType: String, rSubtype: String) - -case class ACEFileIdentifier(fileId: String) - -object LoadACE { - - private val matchTag = "<[A-Za-z=_\"/ ]*>".r - - private def makeDoc(sgm: String): Document = { - val source = io.Source.fromFile(sgm) - val sgmString = source.mkString - source.close() - val doc = new Document(matchTag.replaceAllIn(sgmString, _ => "")).setName(sgm) - doc.attr += new ACEFileIdentifier(sgm.dropRight(4) + ".apf.xml") - DeterministicNormalizingTokenizer.process(doc) - DeterministicSentenceSegmenter.process(doc) - - // trailing tokens should be in a sentence - val end = doc.asSection.sentences.last.end - if (end != doc.asSection.length - 1) - new Sentence(doc.asSection, end + 1, doc.asSection.length - 1 - end) - doc - } - - private def tokenIndexAtCharIndex(charOffset: Int, doc: Document): Int = { - require(charOffset >= 0 && charOffset <= doc.string.length) - var i = 0 - for (t <- doc.tokens) { - if (t.stringStart <= charOffset && charOffset <= t.stringEnd) return i - i += 1 - } - -1 - } - - private def getTokenIdxAndLength(mention: NodeSeq, doc: Document): (Int, Int) = { - val start = getAttr(mention \ "extent" \ "charseq", "START").toInt - val end = getAttr(mention \ "extent" \ "charseq", "END").toInt + 1 - val startTokenIdx = tokenIndexAtCharIndex(start, doc) - val endTokenIdx = tokenIndexAtCharIndex(end, doc) - (startTokenIdx, endTokenIdx - startTokenIdx + 1) - } - - private def getAttr(ns: NodeSeq, key: String): String = { - val k = ns(0).attribute(key).getOrElse(null) - if (k != null) k.text - else "None" - } - - def addMentionsFromApf(apf: NodeSeq, doc: Document): Unit = { - val coref = doc.getTargetCoref - for (entity <- apf \\ "entity") { - val entityKey = (entity \ "entity_attributes" \ "name" \ "charseq").text - val e = coref.entityFromUniqueId(entityKey) - e.attr += ACEEntityIdentifiers(eId = getAttr(entity, "ID"), eType = getAttr(entity, "TYPE"), eSubtype = getAttr(entity, "SUBTYPE"), eClass = getAttr(entity, "CLASS")) - - for (mention <- entity \ "entity_mention") { - val (start, length) = getTokenIdxAndLength(mention, doc) - val headCharIndex = getAttr(mention \ "head" \ "charseq", "END").toInt - - val headLeftCharIndex = getAttr(mention \ "head" \ "charseq", "START").toInt - try { - // set head token to the rightmost token of the ACE head - val tokIndRight = tokenIndexAtCharIndex(headCharIndex, doc) - val phrase = new Phrase(doc.asSection, start, length, tokIndRight-start) - if (phrase.sentence == null) println("NULL mention: (%d, %d) -> %s".format(start, length, phrase.string)) - - val newMention = coref.addMention(phrase) - e += newMention - newMention.attr += new ACEMentionIdentifiers(mId = getAttr(mention, "ID"), mType = getAttr(mention, "TYPE"), ldcType = getAttr(mention, "LDCTYPE"), offsetStart = getAttr(mention \ "extent" \ "charseq", "START").toInt, offsetEnd = getAttr(mention \ "extent" \ "charseq", "END").toInt) - - } catch { - case e: Exception => - println("doc: " + doc.tokens.mkString("\n")) - println("mention: " + mention) - println("headIndex: " + headCharIndex) - println("headLeftIndex: " + headLeftCharIndex) - e.printStackTrace() - System.exit(1) - } - } - } - } - - private def lookupEntityMention(id: String, doc: Document): Mention = - doc.targetCoref.mentions.find { - m => - val a = 
m.attr[ACEMentionIdentifiers] - a != null && a.mId == id - }.get - -// def addRelationsFromApf(apf: NodeSeq, doc: Document): Unit = { -// doc.getCoref -// for (relation <- apf \\ "relation") { -// val identifiers = new ACERelationIdentifiers(rId = getAttr(relation, "ID"), rType = getAttr(relation, "TYPE"), rSubtype = getAttr(relation, "SUBTYPE")) -// -// for (mention <- relation \ "relation_mention") { -// val args = mention \ "relation_mention_argument" map { -// arg => lookupEntityMention(getAttr(arg, "REFID"), doc) -// } -// assert(args.size == 2) -// val m = new RelationMention(args.head, args.last, identifiers.rType, Some(identifiers.rSubtype)) -// if (m.arg1.phrase.sentence != m.arg2.phrase.sentence) println("sentence doesn't match") -// m.attr += identifiers -// doc.attr[RelationMentions].add(m)(null) -// args.foreach(_.attr.getOrElseUpdate(new RelationMentions).add(m)(null)) -// } -// } -// } - - // drops the first two lines (xml decl, and dtd) - private def loadXML(apfFile: String): NodeSeq = { - val source = io.Source.fromFile(apfFile) - val result = XML.loadString(source.getLines().drop(2).mkString("\n")) - source.close() - result - } - - // TODO: consider renaming this to fromFile to match the API for other loaders. - // But if renamed, how can the user know that apf.xml is required (instead of alf.xml or .xml)? - def fromApf(apfFile: String): Document = fromApf(apfFile, makeDoc(apfFile.dropRight(8) + ".sgm")) - - def fromApf(apfFile: String, doc: Document): Document = { - addMentionsFromApf(loadXML(apfFile), doc) - //addRelationsFromApf(loadXML(apfFile), doc) - doc - } - - def fromDirectory(dir: String, takeOnly: Int = Int.MaxValue): Seq[Document] = - new File(dir).listFiles().filter(_.getName.endsWith(".apf.xml")).take(takeOnly).map(f => fromApf(f.getAbsolutePath)) - - def main(args: Array[String]): Unit = { - val docs = fromDirectory(args(0)) - println("docs: " + docs.size) - for (d <- docs) - d.getTargetCoref.mentions.foreach(s => println(s)) - } - -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadAPFCoref.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadAPFCoref.scala deleted file mode 100644 index 60d83af..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadAPFCoref.scala +++ /dev/null @@ -1,324 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -/* -package cc.factorie.app.nlp.load - -import java.io._ -import java.util.zip.GZIPInputStream - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.coref.WithinDocCoref -import cc.factorie.app.nlp.phrase.Phrase -import cc.factorie.app.nlp.pos.PennPosTag -import cc.factorie.app.nlp.segment.{DeterministicNormalizingTokenizer, DeterministicSentenceSegmenter} -import cc.factorie.util.{DefaultCmdOptions, NonValidatingXML} - -import scala.io.Source -import scala.util.matching.Regex -import scala.xml.Node -*/ -/** - * Takes a document and an apf.xml file that contains coreference annotation for that - * document and annotates that document. - * - * If the document already has a (target) coref, this will overwrite mentions that - * overlap with the mentions annotated here. - * - * @author John Sullivan - */ -/* -class LoadAPFCoref(mentions:Seq[SerializableAPFMention], loadAsTarget:Boolean) extends DocumentAnnotator { - - def this(apfFile:File, loadAsTarget:Boolean = true) = this({ - val src = new BufferedInputStream(new FileInputStream(apfFile)) - val offsets = SerializableAPFMention.fromAPFXML(NonValidatingXML load src) - src.close() - offsets - }, loadAsTarget) - - def tokenAnnotationString(token: Token) = null - - def prereqAttrs = Seq(classOf[Token], classOf[Sentence], classOf[PennPosTag]) - def postAttrs = Seq(classOf[WithinDocCoref]) - - def fixOffsets(span:(Int, Int))(implicit offset:OffsetMapper):(Int, Int) = { - val (start, end) = span - val startAdj = offset.fixOffset(start) - val endAdj = startAdj + (end - start) - startAdj -> endAdj - } - - def processFromOffsets(offset:OffsetMapper, document:Document):Document = { - val coref = if(loadAsTarget) document.getTargetCoref else document.getCoref - mentions.sortBy{case SerializableAPFMention(_, _, _, _, (start, end), _) => (end - start) * -1} // sorting by length here means only the longest of overlapping mentions will be loaded later. - .foreach { case SerializableAPFMention(_, entId, entName, mentId, mentSpan, mentHeadSpan) => - val ent = coref.entityFromUniqueId(entId) - if(ent.canonicalName != null && entName.isDefined) { - ent.canonicalName = entName.get - } - val (mentStart, mentEnd) = fixOffsets(mentSpan)(offset) - val (mentHeadStart, mentHeadEnd) = fixOffsets(mentHeadSpan)(offset) - - document.getSectionByOffsets(mentStart, mentEnd).foreach { sec => - sec.tokens.dropWhile(_.stringEnd <= mentStart).takeWhile(_.stringStart <= mentEnd) match { - case toks if toks.size != 0 => - val headIndex = toks.dropWhile(_.stringEnd <= mentHeadStart).takeWhile(_.stringStart <= mentHeadEnd).headOption.map( t => t.position - toks.head.position).getOrElse(0) - val tokSpan = new TokenSpan(toks) - coref.findOverlapping(tokSpan) match { - case Some(existingMention) => ent += existingMention - case None => coref.addMention(new Phrase(tokSpan, headIndex), ent) - } - case _ => () - } - } - } - coref.trimEmptyEntities() - document - } - - //todo do we want to add NER Types while we're at it? - def process(document: Document) = { - - implicit val offset = new OffsetMapper(document.string) - // side effects! 
- processFromOffsets(offset, document) - - document.annotators += classOf[WithinDocCoref] -> classOf[LoadAPFCoref] - document - } -} - -class OffsetMapper(val offsets:Seq[(Int, Int)]) { - def this(rawText:String) = this{ - var numXMLChars = 0 - new Regex("""<[/\w\d "=]+>""").findAllIn(rawText).matchData.map{ m => - numXMLChars += m.matched.length - math.max(0, m.start - numXMLChars) -> numXMLChars - }.toSeq - } - - def this(f:File) = this{ - val src = Source.fromFile(f) - val docString = src.mkString("\n") - src.close() - docString - } - - - - def fixOffset(apfOffset:Int) = offsets.takeWhile(_._1 <= apfOffset ).lastOption.getOrElse(0 -> 0)._2 + apfOffset - - def serialize:String = offsets.map{case (i, j) => i + "?" + j}.mkString(",") -} - -object OffsetMapper { - def deserialize(str:String):OffsetMapper = new OffsetMapper(str.split(",").map{ s => - val Array(i, j) = s.split('?') - i.toInt -> j.toInt - }.toSeq) - - def buildMapperLine(docId:String, docString:String):String = docId + "\t" + new OffsetMapper(docString).serialize - - - - def tacDocumentSplitter(tacDocFile:File):Iterator[String] = new Iterator[String] { - - private val docEndString = """""" - private val webDocStartString = """""" - private val docIdRegex = """(?i)]*>""".r - private val webDocIdRegex = """(?i) ([^ ])+ """.r - - private val tacReader = if(tacDocFile.getName.endsWith(".gz")) { - new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(tacDocFile)))) - } else { - new BufferedReader(new InputStreamReader(new FileInputStream(tacDocFile))) - } - - //priming the pump - private var line = tacReader.readLine() - private var lineNum = 1 - - // grouping together to avoid forgetting something - @inline - private def advanceLine(docBuffer:StringBuilder) { - docBuffer append line - docBuffer append "\n" - line = tacReader.readLine() - lineNum += 1 - } - - - def next() = { - val docBuffer = new StringBuilder() - - - var docIdMatchOpt = docIdRegex.findFirstMatchIn(line) - - // We should be at the start of a new document here, otherwise we have a problem. - assert(line.equalsIgnoreCase(webDocStartString) || docIdMatchOpt.isDefined, "Found line: |%s| that was not a valid doc start at line %d in %s".format(line, lineNum, tacDocFile.getName)) - - val docId = if(docIdMatchOpt.isDefined) { - docIdMatchOpt.get.toString() - } else if(line equalsIgnoreCase webDocStartString) { // we know that one must be true but let's not tempt fate - advanceLine(docBuffer) - webDocIdRegex.findFirstMatchIn(line).get.toString() - } else { - throw new Exception("Found line: |%s| that was not a valid doc start at line %d in %s".format(line, lineNum, tacDocFile.getName)) - } - - while(!line.equalsIgnoreCase(docEndString)) { - advanceLine(docBuffer) - } - // the loop exits when the doc end is found, but that us still part of the previous document so we need to consume it. 
- advanceLine(docBuffer) - docBuffer.toString() - } - - def hasNext = line != null - } - - def splitByLine(docOffsetFile:String, tacRoot:String, outputFile:String) { - - val docEndString = """""" - val webDocStartString = """""" - val docIdRegex = """(?i)]*>""".r - val webDocIdRegex = """(?i) ([^ ])+ """.r - val wrt = new BufferedWriter(new FileWriter(outputFile)) - var count = 0 - var lineCount = 0 - var docStringBuf = new StringBuilder() - var prevLine = null.asInstanceOf[String] - var line = null.asInstanceOf[String] - - Source.fromFile(docOffsetFile).getLines().map {_.split('\t').apply(1)}.toSet.foreach{ filePath:String => - val tacFileReader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(tacRoot + "/" + filePath)), "UTF-8")) - line = tacFileReader.readLine() - lineCount += 1 - - - var docIdRegex(docId) = line - while (line != null) { - docStringBuf append line - if(line equalsIgnoreCase docEndString) { - // write out the doc offsets - wrt write buildMapperLine(docId, docStringBuf.toString()) - wrt.newLine() - - // clear out the buffer - docStringBuf = new StringBuilder() - - // report our results - if(count % 1000 == 0) { // this number is a total guess - println("Wrote offsets for %d files".format(count)) - wrt.flush() - } - count += 1 - } - prevLine = line - line = tacFileReader.readLine() - if(line != null && prevLine != null && prevLine.equalsIgnoreCase(docEndString)) { - var docIdRegex(docId) = line - } - } - } - - wrt.flush() - wrt.close() - println("Wrote offsets for %d files".format(count)) - - } - - def main(args:Array[String]) { - val opts = new OffsetMapperOpts - opts.parse(args) - - splitByLine(opts.docOffsetFile.value, opts.tacRoot.value, opts.outputFile.value) - - } - -} - -class OffsetMapperOpts extends DefaultCmdOptions { - val docOffsetFile = new CmdOption("doc-offset", "", "FILE", "A file containing the offsets of documents into the raw tac document.") - val tacRoot = new CmdOption("tac-root", "", "DIRECTORY", "The root directory in which tac data is stored.") - val outputFile = new CmdOption("output-file", "", "FILE", "The file into which to write the resulting offsets.") -} - -case class SerializableAPFMention(docId:String, entId:String, entName:Option[String], mentId:String, mentSpan:(Int, Int), mentHeadSpan:(Int, Int)) { - def serialize:String = Seq(docId, entId, entName.getOrElse(""), mentId, "%s,%s".format(mentSpan._1, mentSpan._2), "%s,%s".format(mentHeadSpan._1,mentHeadSpan._2)).mkString("\t") -} - -object SerializableAPFMention { - def deserialize(str:String):Option[SerializableAPFMention] = str.split("\t") match { - case Array(docId, entId, entNameStr, mentId, mentSpanStr, mentHeadSpanStr) => - val entName = if(entNameStr.isEmpty) None else Some(entNameStr) - val Array(mentStart, mentEnd) = mentSpanStr.split(",") - val Array(mentHeadStart, mentHeadEnd) = mentHeadSpanStr.split(",") - val mentSpan = mentStart.toInt -> mentEnd.toInt - val mentHeadSpan = mentHeadStart.toInt -> mentHeadEnd.toInt - Some(SerializableAPFMention(docId, entId, entName, mentId, mentSpan, mentHeadSpan)) - case _ => None - } - - private val trimRegex = """\n\s+""".r - private def fixMentionString(str:String):String = trimRegex.replaceAllIn(str, "\n") - private def offsetsFromCharSeq(charSeq:Node):(Int, Int) = (charSeq \ "@START").text.toInt -> ((charSeq \ "@END").text.toInt + 1)//these offsets include the xml/sgml of the original file - - def fromAPFXML(xml:Node):Seq[SerializableAPFMention] = { - val docId = (xml \\ "document" \ "@DOCID").text - val 
mentions = (xml \\ "entity").flatMap{ entNode => - val (entId, entName) = (entNode \ "@ID").text -> (entNode \ "entity_attributes" \ "name" match { - case name if name.nonEmpty => name.head.attribute("NAME").map(a => fixMentionString(a.head.text)) - case _ => None - }) - (entNode \ "entity_mention").map{ mentNode => - val mentId = (mentNode \ "@ID").text - val mentSpan = offsetsFromCharSeq((mentNode \ "extent" \ "charseq").head) // we actually don't need to/can't fix these here - val mentHeadSpan = offsetsFromCharSeq((mentNode \ "head" \ "charseq").head) - SerializableAPFMention(docId, entId, entName, mentId, mentSpan, mentHeadSpan) - } - } - mentions - } -} - -object LoadAPFCoref { - - val TagRegex = new Regex("""<[/\w\d "=]+>""") - def main(args:Array[String]) { - val apfFile = new File("/Users/johnsullivan/data/ace08_eval_sample/CNN889-3.940928.LDC98T25.apf.xml") - val sgmFile = new File("/Users/johnsullivan/data/ace08_eval_sample/CNN889-3.940928.LDC98T25.sgm") - - - val doc = new Document(Source.fromFile(sgmFile).getLines().mkString("\n")) - - - - (DeterministicNormalizingTokenizer.process _ andThen DeterministicSentenceSegmenter.process)(doc) - println("tokenized doc") - val corefAnnotator = new LoadAPFCoref(apfFile) - println("built anno") - - corefAnnotator.process(doc) - println("annotated with coref") - - doc.targetCoref.entities.foreach { ent => - println("Entity: %s".format(ent.canonicalName -> ent.uniqueId)) - ent.mentions.foreach{ ment => - println("\tMention: %s with offsets: %s ".format(ment.phrase.string, ment.phrase.characterOffsets)) - } - } - - } -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadConll2000.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadConll2000.scala deleted file mode 100644 index d560053..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadConll2000.scala +++ /dev/null @@ -1,214 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.load - -import cc.factorie.app.nlp.pos.PennPosTag -import cc.factorie.app.nlp.{Document, Sentence, Token, UnknownDocumentAnnotator} -import cc.factorie.variable._ - -import scala.io.Source - -/** - * @author John Sullivan - * - * Loads shallow parsing/chunking data from Conll 2000 shared task - * Each sentence becomes a document - * - * 1 token type - * 2 gold POS Tag - * 3 gold chunk (BIO notation default) - */ - -object LoadConll2000 extends Load { - //Default BIO encoding for loadConll2000 from Source since this is the standard encoding for conll2000 training data - def fromSource(source: Source) = fromSource(source,"BIO") - def fromSource(source: Source,encoding:String): Seq[Document] = { - val doc = new Document() - doc.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass - doc.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass - doc.annotators(classOf[PennPosTag]) = UnknownDocumentAnnotator.getClass - doc.annotators(classOf[BIOChunkTag]) = UnknownDocumentAnnotator.getClass - - //Enable multiple input encodings - val newChunkLabel = encoding match { - case "BILOU" => (t:Token,s:String) => new BILOUChunkTag(t,s) - case "BIO" => (t:Token,s:String) => new BIOChunkTag(t,s) - case "NESTED" => (t:Token,s:String) => new BILOUNestedChunkTag(t,s) - case _ => (t:Token,s:String) => new BIOChunkTag(t,s) - } - var sent = new Sentence(doc) - source.getLines().foreach{ line => - sent = processWordLine(doc, sent, line, newChunkLabel) - } - Seq(doc) - } - - val lineSplit = """([^\s]+) ([^\s]+) ([^\s]+)""".r - val posTranslations = Map("(" -> "-LRB-", ")" -> "-RRB-") - private def processWordLine(doc:Document, sent:Sentence, line:String,newChunkLabel: (Token,String) => ChunkTag):Sentence = line match { - case lineSplit(tokenType, posTagString, chunkTagString) => { - val t = new Token(sent, tokenType + " ") - t.attr += new PennPosTag(t, posTranslations.getOrElse(posTagString, identity(posTagString))) - t.attr += newChunkLabel(t, chunkTagString) - sent - } - case empty if empty.isEmpty => new Sentence(doc) - case otw => throw new Exception("Expected either a line with token pos tag chunk tag, or an empty line, received: %s".format(otw)) - } - - def convertBIOtoBILOU(sentences: Seq[Sentence]){ - for(sentence <- sentences) { - for(token <- sentence.tokens) { - var prev : Token = null - var next : Token = null - if(token.sentenceHasPrev) prev = token.sentencePrev - if(token.sentenceHasNext) next = token.sentenceNext - token.sentenceNext - val newLabel : String = BIOtoBILOU(prev, token, next) - token.attr += new BILOUChunkTag(token, newLabel) - } - } - } - - def BIOtoBILOU(prev : Token, token : Token, next : Token) : String = { - if(token.attr[BIOChunkTag].categoryValue == "O") return "O" - val ts = token.attr[BIOChunkTag].categoryValue.split("-") - var ps : Array[String] = null - var ns : Array[String] = null - if(next != null) - ns = splitLabel(next) - if(prev != null) - ps = splitLabel(prev) - - if(token.attr[BIOChunkTag].categoryValue.contains("B-")) { - if(next == null || ns(1) != ts(1) || ns(0) == "B") - return "U-" + ts(1) - else - return token.attr[BIOChunkTag].categoryValue - } - - if(next == null || ns(1) != ts(1) || ns(0) == "B") - return "L-" + ts(1) - "I-" + ts(1) - - } - - private def splitLabel(token : Token) : Array[String] = { - if(token.attr[BIOChunkTag].categoryValue.contains("-")) - token.attr[BIOChunkTag].categoryValue.split("-") - else - Array("", "O") - } -} - -//Standard conll2000 Chunk Tags -object BIOChunkDomain extends 
CategoricalDomain[String] { - this ++= Vector("B-ADJP", - "B-ADVP", - "B-CONJP", - "B-INTJ", - "B-LST", - "B-NP", - "B-PP", - "B-PRT", - "B-SBAR", - "B-UCP", - "B-VP", - "I-ADJP", - "I-ADVP", - "I-CONJP", - "I-INTJ", - "I-LST", - "I-NP", - "I-PP", - "I-PRT", - "I-SBAR", - "I-UCP", - "I-VP", - "O") - freeze() -} - -object BILOUChunkDomain extends CategoricalDomain[String] { - this ++= BIOChunkDomain.categories - this ++= Vector( "L-ADVP", - "L-ADJP", - "L-CONJP", - "L-INTJ", - "L-LST", - "L-NP", - "L-PP", - "L-PRT", - "L-SBAR", - "L-UCP", - "L-VP", - "U-ADJP", - "U-ADVP", - "U-CONJP", - "U-INTJ", - "U-LST", - "U-NP", - "U-PP", - "U-PRT", - "U-SBAR", - "U-UCP", - "U-VP") - freeze() -} - -//For Noun Phrase Chunk Tagging -//Requires custom training data tagged in this notation -object BILOUNestedChunkDomain extends CategoricalDomain[String] { - this ++= Vector( "B-NP:B-NP", - "B-NP:I-NP", - "B-NP:L-NP", - "B-NP:U-NP", - "B-NP:O", - "I-NP:B-NP", - "I-NP:I-NP", - "I-NP:L-NP", - "I-NP:U-NP", - "I-NP:O", - "L-NP:B-NP", - "L-NP:I-NP", - "L-NP:L-NP", - "L-NP:U-NP", - "L-NP:O", - "U-NP:B-NP", - "U-NP:I-NP", - "U-NP:L-NP", - "U-NP:U-NP", - "U-NP:O", - "O:B-NP", - "O:I-NP", - "O:L-NP", - "O:U-NP", - "O:O" - ) - freeze() -} - -//This could be combined into a single LabeledCategoricalVariable with a settable domain -abstract class ChunkTag(val token:Token, tagValue:String) extends LabeledCategoricalVariable(tagValue) - -class BIOChunkTag(token:Token, tagValue:String) extends ChunkTag(token, tagValue) { - def domain = BIOChunkDomain -} - -class BILOUChunkTag(token:Token, tagValue:String) extends ChunkTag(token,tagValue) { - def domain = BILOUChunkDomain -} - -class BILOUNestedChunkTag(token:Token, tagValue:String) extends ChunkTag(token,tagValue) { - def domain = BILOUNestedChunkDomain -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadConll2002.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadConll2002.scala deleted file mode 100644 index 232bf8d..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadConll2002.scala +++ /dev/null @@ -1,118 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
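Aside: the BIOtoBILOU conversion in the removed LoadConll2000 above (and the IOBtoBILOU variant in the removed LoadConll2002 below) reduces to a one-token lookahead over the label sequence. A self-contained sketch of the same rule over plain label strings (illustration only; bioToBilou is not a FACTORIE API):

  // Convert a BIO-encoded label sequence to BILOU by inspecting the following label.
  def bioToBilou(labels: Seq[String]): Seq[String] =
    labels.zipWithIndex.map { case (label, i) =>
      if (label == "O") "O"
      else {
        val Array(prefix, tpe) = label.split("-", 2)
        val next = if (i + 1 < labels.length) Some(labels(i + 1)) else None
        val continued = next.exists(_ == s"I-$tpe")   // same type continues on the next token
        prefix match {
          case "B" => if (continued) s"B-$tpe" else s"U-$tpe"   // singleton chunks become U-
          case _   => if (continued) s"I-$tpe" else s"L-$tpe"   // chunk-final tokens become L-
        }
      }
    }

  // bioToBilou(Seq("B-NP", "I-NP", "O", "B-VP")) == Seq("B-NP", "L-NP", "O", "U-VP")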
*/ - -package cc.factorie.app.nlp.load -import cc.factorie.app.nlp.{Document, Sentence, Token, UnknownDocumentAnnotator} -import cc.factorie.app.nlp.ner._ -import cc.factorie.util.FastLogging - -import scala.collection.mutable.ArrayBuffer - -// Usage: -// Either LoadConll2002.fromFilename("foo") -// or LoadConll2003(BILOU = true).fromFilename("foo") - -object LoadConll2002 extends LoadConll2002(false) - -case class LoadConll2002(BILOU:Boolean = false) extends Load with FastLogging { - val conllToPennMap = Map("\"" -> "''", "(" -> "-LRB-", ")" -> "-RRB-", "NN|SYM" -> "NN") - - def fromSource(source:io.Source): Seq[Document] = { - import scala.collection.mutable.ArrayBuffer - def newDocument(name:String): Document = { - val document = new Document("").setName(name) - document.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass // register that we have token boundaries - document.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass // register that we have sentence boundaries -// document.annotators(classOf[pos.PennPosTag]) = UnknownDocumentAnnotator.getClass // register that we have POS tags - document - } - - val documents = new ArrayBuffer[Document] - var document = newDocument("CoNLL2002-"+documents.length) - documents += document - var sentence = new Sentence(document) - for (line <- source.getLines()) { - if (line.length < 2) { // Sentence boundary - document.appendString("\n") - if(sentence.nonEmpty) sentence = new Sentence(document) - } - else { - val fields = line.split(' ') - assert(fields.length == 2) - val word = fields(0) -// val partOfSpeech = conllToPennMap.getOrElse(fields(1), fields(1)) - val ner = fields(1).stripLineEnd - if (sentence.length > 0) document.appendString(" ") - val token = new Token(sentence, word) - token.attr += new LabeledBioConllNerTag(token, ner) -// token.attr += new cc.factorie.app.nlp.pos.PennPosTag(token, partOfSpeech) - } - } - if (BILOU) convertToBILOU(documents) - logger.info("Loaded "+documents.length+" documents with "+documents.map(_.sentences.size).sum+" sentences with "+documents.map(_.tokens.size).sum+" tokens total") - documents - } - def convertToBILOU(documents : ArrayBuffer[Document]) { - for(doc <- documents) { - for(sentence <- doc.sentences) { - for(token <- sentence.tokens) { - val ner = token.nerTag - var prev : Token = null - var next : Token = null - if(token.sentenceHasPrev) prev = token.sentencePrev - if(token.sentenceHasNext) next = token.sentenceNext - token.sentenceNext - val newLabel : String = IOBtoBILOU(prev, token, next) - token.attr += new LabeledBilouConllNerTag(token, newLabel) - } - } - } - } - - def IOBtoBILOU(prev : Token, token : Token, next : Token) : String = { - if(token.nerTag.categoryValue == "O") return "O" - // The major case that needs to be converted is I, which is dealt with here - val ts = token.nerTag.categoryValue.split("-") - var ps : Array[String] = null - var ns : Array[String] = null - if(prev != null) - ps = splitLabel(prev) - if(next != null) - ns = splitLabel(next) - - if(token.nerTag.categoryValue.contains("B-")) { - if(next == null || ns(1) != ts(1) || ns(0) == "B") - return "U-" + ts(1) - else - return token.nerTag.categoryValue - } - - if(prev == null || ps(1) != ts(1)) { - if(next == null || ns(1) != ts(1) || ns(0) == "B") - return "U-" + ts(1) - return "B-" + ts(1) - } - if(next == null || ns(1) != ts(1) || ns(0) == "B") - return "L-" + ts(1) - "I-" + ts(1) - } - - private def splitLabel(token : Token) : Array[String] = { - if(token.nerTag.categoryValue.contains("-")) 
- token.nerTag.categoryValue.split("-") - else - Array("", "O") - } -} - - diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadConll2003.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadConll2003.scala index 1c6a267..ec0444a 100644 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadConll2003.scala +++ b/src/main/scala/cc/factorie/app/nlp/load/LoadConll2003.scala @@ -1,19 +1,8 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp.load -import cc.factorie.app.nlp.{Document, Sentence, Token, UnknownDocumentAnnotator, _} -import cc.factorie.app.nlp.ner._ + +import cc.factorie.app.nlp.ner.LabeledBilouConllNerTag +import cc.factorie.app.nlp.pos.PennPosTag +import cc.factorie.app.nlp.{Document, Sentence, Token, UnknownDocumentAnnotator} import cc.factorie.util.FastLogging import scala.collection.mutable.ArrayBuffer @@ -68,8 +57,8 @@ case class LoadConll2003(BILOU:Boolean = false, verbose:Boolean = false) extends document = new Document().setName("CoNLL2003-" + documents.length) document.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass // register that we have token boundaries document.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass // register that we have sentence boundaries - document.annotators(classOf[pos.PennPosTag]) = UnknownDocumentAnnotator.getClass // register that we have POS tags - document.annotators(classOf[LabeledBioConllNerTag]) = UnknownDocumentAnnotator.getClass // register that we have IOB NER tags + document.annotators(classOf[PennPosTag]) = UnknownDocumentAnnotator.getClass // register that we have POS tags + //document.annotators(classOf[LabeledBioConllNerTag]) = UnknownDocumentAnnotator.getClass // register that we have IOB NER tags if (BILOU) document.annotators(classOf[LabeledBilouConllNerTag]) = UnknownDocumentAnnotator.getClass // register that we have BILOU NER tags sentence = new Sentence(document) } @@ -81,18 +70,18 @@ case class LoadConll2003(BILOU:Boolean = false, verbose:Boolean = false) extends val ner = fields(3).stripLineEnd if (sentence.length > 0) document.appendString(" ") val token = new Token(sentence, word) - val bioLabel = new LabeledBioConllNerTag(token, ner) - token.attr += bioLabel + //val bioLabel = new LabeledBioConllNerTag(token, ner) + //token.attr += bioLabel - token.attr += new cc.factorie.app.nlp.pos.PennPosTag(token, partOfSpeech) + token.attr += new PennPosTag(token, partOfSpeech) } } // Take care of last document that may have been accumulated if (document.tokenCount > 0){ - if(document.sentences.last.isEmpty) document.asSection -= document.sentences.last - document.asSection.chainFreeze() - documents += document - } + if(document.sentences.last.isEmpty) document.asSection -= document.sentences.last + document.asSection.chainFreeze() + documents += document + } if (BILOU) convertToBILOU(documents) if 
(verbose) logger.info("Loaded "+documents.length+" documents with "+documents.map(_.sentences.size).sum+" sentences with "+documents.map(_.tokens.size).sum+" tokens total") documents @@ -148,5 +137,3 @@ case class LoadConll2003(BILOU:Boolean = false, verbose:Boolean = false) extends Array("", "O") } } - - diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadConll2008.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadConll2008.scala deleted file mode 100644 index feff645..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadConll2008.scala +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -package cc.factorie.app.nlp.load - -import java.io.PrintWriter - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.lemma.TokenLemma -import cc.factorie.app.nlp.parse.ParseTree -import cc.factorie.app.nlp.pos.{LabeledSpanishPosTag, PennPosTag, PosTag} -import cc.factorie.util.FastLogging - -import scala.io.Source - -/* - * Loader for the CoNLL 2008 closed-track shared task data. - * wordIndex word lemma POS parentIndex depLabel - * Details on the format are available at http://barcelona.research.yahoo.net/dokuwiki/doku.php?id=conll2008:format - * - * @author Brian Martin - */ - -object LoadSpanishConll2008 extends LoadConll2008(classOf[pos.SpanishPosTag]) { - def makePosTag(token: Token, partOfSpeech: String): PosTag = - new LabeledSpanishPosTag(token, partOfSpeech) -} - -object LoadConll2008 extends LoadConll2008(classOf[pos.PennPosTag]) { - def makePosTag(token: Token, partOfSpeech: String): PosTag = - new PennPosTag(token, partOfSpeech) -} - -abstract class LoadConll2008(val posType: Class[_]) extends Load with FastLogging { - - def makePosTag(token: Token, partOfSpeech: String): PosTag - - private def addDepInfo(s: Sentence, depInfoSeq: Seq[(Int, Int, String)]): Unit = { - val tree = new ParseTree(s) - for ((childIdx, parentIdx, depLabel) <- depInfoSeq) { - tree.setParent(childIdx, parentIdx) - tree.label(childIdx).setCategory(depLabel)(null) - } - s.attr += tree - } - - var loadLemma = true - - def fromFilename(filename: String): Seq[Document] = { - fromSource(Source.fromFile(filename)) - } - - def fromSource(source: Source): Seq[Document] = { - val document: Document = new Document - document.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass // register that we have token boundaries - document.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass // register that we have sentence boundaries - document.annotators(posType) = UnknownDocumentAnnotator.getClass // register that we have POS tags - if (loadLemma) document.annotators(classOf[TokenLemma]) = UnknownDocumentAnnotator.getClass // register that we have lemma - var sentence: Sentence = new Sentence(document) -// var depInfoSeq = new collection.mutable.ArrayBuffer[(Int, Int, String)] - for (line <- 
source.getLines()) { - if (line.length < 2) { - // Sentence boundary - document.appendString("\n") -// addDepInfo(sentence, depInfoSeq) -// depInfoSeq = new collection.mutable.ArrayBuffer[(Int, Int, String)] - sentence = null - } else { - if (sentence eq null) - sentence = new Sentence(document) // avoids empty sentence at the end of doc - val fields = line.split('\t') - assert(fields.length >= 10) - val currTokenIdx = fields(0).toInt - 1 - val word = fields(1) - val lemma = fields(2) - val partOfSpeech = fields(3) - val parentIdx = fields(8).toInt - 1 - val depLabel = fields(9) - document.appendString(" ") - val token = new Token(sentence, word) - token.attr += makePosTag(token, partOfSpeech) - if (loadLemma) - token.attr += new TokenLemma(token, lemma) // TODO Change this to some more specific TokenLemma subclass -// depInfoSeq.append((currTokenIdx, parentIdx, depLabel)) - } - } -// if (sentence ne null) -// addDepInfo(sentence, depInfoSeq) - - println("Loaded 1 document with " + document.sentences.size + " sentences with " + document.asSection.length + " tokens total from file.") - Seq(document) - } - - def printDocument(d: Document) = - for (s <- d.sentences) - println(s.attr[ParseTree].toString() + "\n") - -// def main(args: Array[String]) = -// for (filename <- args) -// printDocument(fromFilename(filename).head) - -} - - -object WriteConll2008 { - - // if the source file is given, then include the fields that we don't know anything about - // otherwise just give underscores for info we don't know. - def toFile(outputFile: String, document: Document, sourceFile: String = null): Unit = { - val source = { - if (sourceFile eq null) None else Some(Source.fromFile(sourceFile).getLines()) - } - val sentences = document.sentences.iterator - val writer = new PrintWriter(outputFile) - var sentence: Sentence = sentences.next() - var currTokenIdx = 0 - var tree: ParseTree = sentence.parse - while (true) { - if (currTokenIdx == sentence.length) { - writer.println() - if (sentences.hasNext) { - source match { - case Some(source) => source.next() - case _ => () - } - sentence = sentences.next() - tree = sentence.parse - currTokenIdx = 0 - } - else { - writer.close() - return - } - } - else { - val field8 = "" + (tree.parentIndex(currTokenIdx) + 1) - val field9 = "" + { - val category = tree.label(currTokenIdx).categoryValue; - if (category == "") "_" else category - } - val fields = source match { - case None => { - val x = Array.fill[String](10)("_") - x(0) = "" + (currTokenIdx + 1) - x(1) = sentence.tokens(currTokenIdx).string - x(3) = sentence.tokens(currTokenIdx).posTag.categoryValue - x(8) = field8 - x(9) = field9 - x - } - case Some(source) => { - val x = source.next().split("\t") - x(8) = field8 - x(9) = field9 - x - } - } - currTokenIdx += 1 - writer.println(fields.mkString("\t")) - } - } - } - -} diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadConll2011.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadConll2011.scala deleted file mode 100644 index 9ceeb21..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadConll2011.scala +++ /dev/null @@ -1,328 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
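For reference, a minimal usage sketch of the CoNLL-2008 loader removed above, assuming its pre-removal API; the input path is hypothetical:

```scala
import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.load.LoadConll2008

// Each non-blank line is tab-separated "wordIndex word lemma POS ... parentIndex depLabel";
// blank lines end sentences. The loader puts the whole file into a single Document.
val docs: Seq[Document] = LoadConll2008.fromFilename("data/conll2008/train.closed") // hypothetical path
val doc = docs.head
println(s"${doc.sentences.size} sentences, ${doc.tokens.size} tokens")
```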
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.load - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.pos.{PennPosDomain, PennPosTag} -import cc.factorie.variable.Span - - -//import cc.factorie.app.nlp.coref.mention.{MentionEntityType, MentionList, Mention, Entity} -import cc.factorie.app.nlp.coref._ -import cc.factorie.app.nlp.phrase.{OntonotesPhraseEntityType, Phrase} - -import scala.collection.mutable -import scala.collection.mutable.{ArrayBuffer, Map} -import scala.util.control.Breaks._ - -class EntityKey(val name: String) - -object LoadConll2011 { - - //this is used when loading gold entity type annotation. If this variable is set to true, the loader - // only uses the entity type if its boundaries exactly match the boundaries of the annotated mention - val useExactEntTypeMatch = false - - // to be used with test-with-gold-mention-boundaries - val autoFileFilter = new java.io.FileFilter() { - override def accept(file: java.io.File): Boolean = - file.getName.endsWith("auto_conll") - } - - // to be used with test-key - val goldFileFilter = new java.io.FileFilter() { - override def accept(file: java.io.File): Boolean = - file.getName.endsWith("gold_conll") - } - - - - @inline def unescapeBrackets(s: String) = - s match { - case "-LRB-" => "(" - case "-RRB-" => ")" - case "-LSB-" => "[" - case "-RSB-" => "]" - case "-LCB-" => "{" - case "-RCB-" => "}" - case _ => s - } - - //(15|(43 - final val copularVerbs = collection.immutable.HashSet[String]() ++ Seq("is","are","was","'m") - //val openEntity = """\( (\d+)""".r - val singleLineEntity = """""" - val tokenizer = """(\(|\||\)|\d+)""".r - val entityTypeTokenizer = """(\(|[^\)]+|\)|)""".r - - //val corefEntityTokenizer = """(\(|[^\)]+|\)|)""".r - - val asteriskStripper = """\*""".r - - private def tokenizeEntityType(s: String): Array[String] = { - entityTypeTokenizer.findAllIn(s).map(x => asteriskStripper.replaceAllIn(x,"")).map(_.toString).toArray - } - - // disperseEntityTypes optionally gives entity type information to all things that are coreferent with something that has entity type annotation - //2 Documents in Train: 161.5 mentions/doc - def loadWithParse(f: String, loadSingletons: Boolean = true, limitNumDocuments:Int = -1, callDisperseEntityTypes:Boolean = false): Seq[Document] = { - // println("loading " + f) - val docs = ArrayBuffer[Document]() - - var coref: WithinDocCoref = null - var currDoc: Document = null - var currSent: Sentence = null - var currEntId: Int = 0 - var docTokInd: Int = -1 - var numMentions = 0 // total number mentions in a document - val entities = Map[String, WithinDocEntity]() - var sentenceId: Int = -1 - var tokenId: Int = -1 - - val parseStack = collection.mutable.Stack[(String,Int)]() - var currParseTree:ConstituencyParse = null - - val source = scala.io.Source.fromFile(f) - var prevPhrase = "" - var prevWord = "" - - val goldMentionBoundaries = new scala.collection.mutable.LinkedHashMap[Span[Section,Token],CoreferentEntityChunk] - val _spanToEntityType = new scala.collection.mutable.LinkedHashMap[Span[Section,Token],String] - var unResolvedEntityType:EntityTypeChunk = null - - val openEntityStack = 
mutable.Stack[CoreferentEntityChunk]() - - breakable { for (l <- source.getLines()) { - if (l.startsWith("#begin document ")) { - if (docs.length == limitNumDocuments) break() - val fId = l.split("[()]")(1) + "-" + l.takeRight(3) - currDoc = new Document("").setName(fId) - currDoc.getCoref - coref = currDoc.getTargetCoref // This also puts a newly created WithinDocCoref in currDoc.attr. - currDoc.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass // register that we have token boundaries - currDoc.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass // register that we have token boundaries - //currDoc.attr += new FileIdentifier(fId, true, fId.split("/")(0), "CoNLL") - docs += currDoc - } else if (l.startsWith("#end document")) { - coref = null - currDoc = null - currEntId = 0 - - _spanToEntityType.clear() - goldMentionBoundaries.clear() - openEntityStack.clear() - entities.clear() - parseStack.clear() - docTokInd = -1 - sentenceId = -1 - tokenId = -1 - - } else if (l.length == 0) { - currDoc.appendString("\n") - parseStack.clear() - currSent = null - } else { - docTokInd += 1 - val fields = l.split("\\s+") - val tokId = fields(2).toInt - val word = unescapeBrackets(fields(3)) - currDoc.appendString(" ") - if (tokId == 0) { - currSent = new Sentence(currDoc) - currParseTree = new ConstituencyParse(currSent,0,"TOP") - prevPhrase = "" - prevWord = "" - } - val token = new Token(currSent, word) - PennPosDomain.unfreeze() //todo: factorie PennPosDomain currently contains all of the ontonotes tags. Might want to freeze this up for thread safety - token.attr += new PennPosTag(token,fields(4)) - tokenId += 1 - if (tokId == 0) sentenceId += 1 - - val entityTypeTokens = tokenizeEntityType(fields(10)).filterNot(_.isEmpty) - entityTypeTokens match { - case Array("(",entityTypeString:String,")") => _spanToEntityType.put(new TokenSpan(currSent.section,docTokInd,1).value,entityTypeString) //todo:Don't forget to change this to new span - case Array("(",entityTypeString) => - assert(unResolvedEntityType eq null,"Nested Entity Types Found") - unResolvedEntityType = new EntityTypeChunk(entityTypeString,docTokInd) - case Array(")") => - _spanToEntityType.put(new TokenSpan(currSent.section,unResolvedEntityType.start,docTokInd-unResolvedEntityType.start+1).value,unResolvedEntityType.entityType) - unResolvedEntityType = null - case _ => - } - - val entityLabels = fields.last.split('|').map(_.trim) - for(label <- entityLabels){ - val corefTags = tokenizeEntityType(label).filterNot(l => l.isEmpty) - corefTags match { - case Array("(",entityId,")") => goldMentionBoundaries.put(new Span(currSent.section,docTokInd,1),new CoreferentEntityChunk(fields(0)+"-*"+entityId,docTokInd)) - case Array("(",entityId) => openEntityStack.push(new CoreferentEntityChunk(fields(0)+"-*"+entityId,docTokInd)) - case Array(entityId,")") => - val lastOpenedEntity = openEntityStack.pop() - goldMentionBoundaries.put(new TokenSpan(currSent.section,lastOpenedEntity.mentionStart,docTokInd - lastOpenedEntity.mentionStart + 1).value,lastOpenedEntity) - case _ => - } - } - - val constituencyLabels = fields(5).split("\\*") - if (constituencyLabels.length >= 1 && loadSingletons) { - val bracketOpens = constituencyLabels(0) - val bracketCloses = if (constituencyLabels.length > 1) constituencyLabels(1) else "" - for (nonTerminal <- bracketOpens.split("\\(").drop(1)) { - parseStack.push((nonTerminal, docTokInd)) - currParseTree.addChild(nonTerminal,docTokInd) - } - for (close <- bracketCloses) { - val (phrase, start) = 
parseStack.pop() - val parentPhrase = if(!parseStack.isEmpty) parseStack(0)._1 else "" - //if(Vector("NP","PRP","PP").contains(phrase)) - currParseTree.current.setEnd(docTokInd) - if (phrase == "NP") { - val span = new TokenSpan(currDoc.asSection, start, docTokInd - start + 1) - val newMention = coref.addMention(new Phrase(span, span.tokens.indexOf(currParseTree.current.getHeadToken(docTokInd)))) - numMentions += 1 - currParseTree.closeLabel(docTokInd) - - val entityTypesForSpan = _spanToEntityType.filterKeys(span.value.contains) - if(!entityTypesForSpan.isEmpty){ - val exactMatch = entityTypesForSpan.find(entitySpan => (entitySpan._1.start == start) && (entitySpan._1.end == docTokInd) ) - val exactMatchExists = exactMatch ne null - if (!useExactEntTypeMatch ||(useExactEntTypeMatch && exactMatchExists)) - newMention.phrase.attr += new OntonotesPhraseEntityType(newMention.phrase, entityTypesForSpan.find(s => s._1.exists(t=> t == newMention.phrase.headToken)).getOrElse(entityTypesForSpan.head)._2,exactMatchExists) - else - newMention.phrase.attr += new OntonotesPhraseEntityType(newMention.phrase, "O",exactMatchExists) - } else - newMention.phrase.attr += new OntonotesPhraseEntityType(newMention.phrase, "O") - - val entityChunkForMention = goldMentionBoundaries.getOrElse(newMention.phrase.value,new CoreferentEntityChunk(fields(0)+"-"+(-coref.mentions.size),start,true)) - //Register that we have found this mention - entityChunkForMention.found = true - newMention.attr += new EntityKey(entityChunkForMention.entityId) - val corefEntity = entities.getOrElseUpdate(entityChunkForMention.entityId,coref.entityFromUniqueId(entityChunkForMention.entityId)) - corefEntity += newMention - }else currParseTree.closeLabel(docTokInd) - prevPhrase = phrase - } - } - //this makes mentions for the ground truth mentions that weren't found by the NP, PRP Rules - for ((goldMentionSpan,goldMentionEntityInfo) <- goldMentionBoundaries.filter{case (mentionSpan,mentionEntityInfo) => !mentionEntityInfo.found}) { - //assert(currParseTree.current.parent.start == start,"Not in Parent") - val span = new TokenSpan(currDoc.asSection, goldMentionSpan.start, goldMentionSpan.length) - val newMention = coref.addMention(new Phrase(span, getSimpleHeadToken(span))) - val entityTypesForSpan = _spanToEntityType.filterKeys(span.value.contains) - if(!entityTypesForSpan.isEmpty){ - val exactMatch = entityTypesForSpan.getOrElse(span.value,null)//.find(entitySpan => (entitySpan._1.start == start) && (entitySpan._1.end == docTokInd) ) - val exactMatchExists = exactMatch ne null - if (!useExactEntTypeMatch ||(useExactEntTypeMatch && exactMatchExists)) - newMention.phrase.attr += new OntonotesPhraseEntityType(newMention.phrase, entityTypesForSpan.find(s => s._1.exists(t=> t == newMention.phrase.headToken)).getOrElse(entityTypesForSpan.head)._2,exactMatchExists) - else - newMention.phrase.attr += new OntonotesPhraseEntityType(newMention.phrase, "O",exactMatchExists) - } else - newMention.phrase.attr += new OntonotesPhraseEntityType(newMention.phrase, "O") - - numMentions += 1 - - val entityChunkForMention = goldMentionBoundaries.getOrElse(newMention.phrase.value,new CoreferentEntityChunk(fields(0)+"-"+coref.mentions.size+1,goldMentionSpan.start,true)) - entityChunkForMention.found = true - newMention.attr += new EntityKey(entityChunkForMention.entityId) - val corefEntity = entities.getOrElseUpdate(entityChunkForMention.entityId,coref.entityFromUniqueId(entityChunkForMention.entityId)) - corefEntity += newMention - - } - prevWord = word - } - - 
}} // closing "breakable" - if (callDisperseEntityTypes) disperseEntityTypes(docs.map(_.getTargetCoref)) - source.close() - docs - } - - case class CoreferentEntityChunk(entityId:String,mentionStart:Int,var found:Boolean = false) - case class EntityTypeChunk(entityType:String, start:Int) - - def disperseEntityTypes(corefDocs:Seq[WithinDocCoref]):Unit = { - for(corefDoc <- corefDocs){ - val entities = corefDoc.mentions.toSeq.groupBy(m => m.entity).filter(x => x._2.length > 1) - for(ent <- entities){ - val entityTypes = ent._2.map(m => m.phrase.attr[OntonotesPhraseEntityType].categoryValue).filter(t => t != "O").distinct - if(entityTypes.length > 1){ - // println("warning: there were coreferent mentions with different annotated entity types: " + entityTypes.mkString(" ") + "\n" + ent._2.map(m => m.span.string).mkString(" ")) - }else if(entityTypes.length == 1){ - val newType = entityTypes(0) - ent._2.foreach(m => m.phrase.attr[OntonotesPhraseEntityType].target.setCategory(newType)(null)) - } - } - } - } - - /**This is a span-level offset. Since we don't have a dep parse, we just take the final noun in the span */ - def getSimpleHeadToken(span: TokenSpan): Int = { - //val interiorNP = parseTree.current.children.find(_.label == "NP") - val toReturn = span.value.lastIndexWhere(_.posTag.categoryValue.startsWith("NN")) - //val allNP = span.value.filter(_.posTag.categoryValue.startsWith("NN")).map(_.string).toSeq - if(toReturn == -1){ - span.length - 1 - }else{ - toReturn - } - } -} - - - - -class ConstituencyParse(val sent: Sentence,rootStart:Int,rootLabel:String){ - var current = new ConstLabel(rootLabel,rootStart) - def addChild(label:String,start:Int) = { - val newChild = new ConstLabel(label,start,current) - current.children += newChild - current = newChild - } - def closeLabel(end:Int){ - current.setEnd(end) - current = current.parent - } - - class ConstLabel(val label:String,val start:Int,parentNode:ConstLabel = null){ - val parent:ConstLabel = parentNode - val children:ArrayBuffer[ConstLabel] = new ArrayBuffer[ConstituencyParse.this.type#ConstLabel]() - var endIdx:Int = -1 - var span:TokenSpan = null - def setEnd(end:Int) = { - span = new TokenSpan(sent.section,start,end - start + 1) - endIdx = end - } - def getHeadToken(docTokInd:Int):Token ={ - val childNP = children.filter(_.label == "NP") - val possNP = span.tokens.find(_.posTag.intValue == PennPosDomain.posIndex) - if(possNP.isDefined && possNP.get != span.last && possNP.get.next.posTag.categoryValue.startsWith("N")) { - return possNP.get.next - } - else if(!childNP.isEmpty) childNP.head.getHeadToken(docTokInd) - else { - span.value.foreach(t=>assert(t.posTag != null)) - val lastIndexOfNoun = span.value.lastIndexWhere(_.posTag.categoryValue.startsWith("NN")) - if(lastIndexOfNoun == -1 && span!=null) { - //println("** Head Error: " + span.string+" "+span.last.string) - span.last - } - else span.tokens(lastIndexOfNoun) - } - } - } -} - - - - diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadConllCoreference.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadConllCoreference.scala deleted file mode 100644 index fadaf9c..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadConllCoreference.scala +++ /dev/null @@ -1,351 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. 
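A hedged usage sketch for the CoNLL-2011 loader deleted above, following the loadWithParse signature it defined; the path and document limit are illustrative only:

```scala
import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.load.LoadConll2011

val docs: Seq[Document] = LoadConll2011.loadWithParse(
  "data/conll-2011/train.gold_conll",  // hypothetical flattened *_gold_conll file
  loadSingletons = true,               // also add every NP from the parse as a singleton mention
  limitNumDocuments = 10,
  callDisperseEntityTypes = true)      // spread entity types across coreferent mentions
docs.foreach(d => println(s"${d.name}: ${d.getTargetCoref.mentions.size} gold mentions"))
```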
- This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.load - -import java.io.File - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.coref.{Mention, WithinDocCoref, WithinDocEntity} -import cc.factorie.app.nlp.phrase.{OntonotesPhraseEntityType, Phrase} -import cc.factorie.app.nlp.pos.PennPosTag -import cc.factorie.variable.Span - -import scala.collection.mutable -import scala.collection.mutable.{ArrayBuffer, Map} - -/** - * Loader for conll 2011 or conll 2012, call separately for training data and test data if both are needed - * If auto data file or directory is NOT supplied: - * if loadFromParse = False then gold mentions grouped as their true entities annotated in the goldFile are loaded onto the Document.targetCoref object - * if loadFromParse = True then gold mentions grouped as their true entities and all noun phrases, added as singletons, are loaded onto the targetCoref - * If autoDirectory or File IS supplied: - * Gold annotated mentions grouped as entities are loaded from the goldFile given onto the Document.targetCoref - * if loadFromParse = False then annotated predicted mentions in the autoDir/autoFile will be loaded as singletons onto Document.coref to be coreferred - * if loadFromParse = True then all noun phrases and pronouns from the predicted constituency parse will be added as singletons to the Document.coref - * - * goldFilename String path to flattened conll key file with gold coreference annotation, if only annotation and not testing is wanted, the flattened auto filename can be given here as well - * limitNumDocuments Int count of documents to load - * autoDirOpt Option[String] (Optional) Directory path or file path to conll_auto data - */ - -object LoadConllCoreference { - def load(goldFilename: String, limitNumDocuments: Int, loadFromParse: Boolean, autoDirOpt:Option[String] = None): Seq[Document] = { - val conllLoader = new Conll2011Iterator(goldFilename, loadFromParse, autoDirOpt.map {new File(_)}) - val docs = new ArrayBuffer[Document]() - while (conllLoader.hasNext && (limitNumDocuments == -1 || docs.size < limitNumDocuments)) { - docs += conllLoader.next() - } - disperseEntityTypes(docs.map(_.getTargetCoref)) - if (autoDirOpt.isDefined) disperseEntityTypes(docs.map(_.getCoref)) - docs - } - - def disperseEntityTypes(corefDocs:Seq[WithinDocCoref]):Unit = { - for (corefDoc <- corefDocs) { - val entities = corefDoc.mentions.toSeq.groupBy(m => m.entity).filter(x => x._2.length > 1) - for (ent <- entities) { - val entityTypes = ent._2.map(m => m.phrase.attr[OntonotesPhraseEntityType].categoryValue).filter(t => t != "O").distinct - if (entityTypes.length > 0) { - //Note, this takes the first entity type in case of within cluster entity type agreement - val newType = entityTypes(0) - ent._2.foreach(m => m.phrase.attr[OntonotesPhraseEntityType].target.setCategory(newType)(null)) - } - } - } - } -} - -class 
Conll2011Iterator(goldFile: String, loadFromParse: Boolean = true, autoDirOpt:Option[File] = None) extends Iterator[Document] { - private val goldDocs = new ConllOWPLIterator(goldFile) - private val useExactEntTypeMatch = true - private val autoMapOpt = autoDirOpt.map { autoDir => - if(autoDir.isDirectory) { - autoDir.listFiles().flatMap {autoFile => new ConllOWPLIterator(autoFile.getAbsolutePath).toSeq}.toMap - } else { - new ConllOWPLIterator(autoDir.getAbsolutePath).toMap - } - } - - def next() = { - val (id, goldLines) = goldDocs.next() - val doc = new Document() - doc.setName(id) - - val autoLinesOpt = autoMapOpt.flatMap(_.get(id)) - - doc.getCoref - - var docTokIdx = -1 - var sentenceIdx = -1 - var currSentence: Sentence = null - - val goldAnnotationResolver = new DocumentMentionBoundariesResolver(doc.getTargetCoref, key = true) - val autoAnnotationResolver: DocumentMentionBoundariesResolver = if (autoLinesOpt.isDefined) new DocumentMentionBoundariesResolver(doc.getCoref, key = false) else null - - while (goldLines.hasNext) { - val goldLine = goldLines.next() - val autoLine: Option[String] = autoLinesOpt.map { - _.next() - } - - if (goldLine == "") { - doc.appendString("\n") - } else if (goldLine != "#end document") { - docTokIdx += 1 - val goldFields = goldLine.split("\\s+") - val tokId = goldFields(2).toInt - val word = unescapeBrackets(goldFields(3)) - doc.appendString(" ") - if (tokId == 0) { - currSentence = new Sentence(doc) - goldAnnotationResolver.createNewSentence(currSentence) - if (autoLinesOpt.isDefined) autoAnnotationResolver.createNewSentence(currSentence) - sentenceIdx += 1 - } - - val token = new Token(currSentence, word) - token.attr += new PennPosTag(token, goldFields(4)) - token.attr += new MentionSpeaker(goldFields(9)) - - goldAnnotationResolver.storeNEREntityChunks(goldFields(10), currSentence, token, docTokIdx) - goldAnnotationResolver.storeCorefEntityChunk(goldFields.last, currSentence, docTokIdx, doc.name) - - if (autoLinesOpt.isDefined) { - val autoFields = autoLine.get.split("\\s+") - val autoWord = unescapeBrackets(autoFields(3)) - if(autoWord != word) - println( "Gold Document and Auto Document are out of sync") - val constituencyLabels = autoFields(5).split("\\*") - if (constituencyLabels.length >= 1 && loadFromParse) { - if (token.posTag.categoryValue == "PRP" || token.posTag.categoryValue == "PRP$") autoAnnotationResolver.createPRPMentions(token, docTokIdx) - autoAnnotationResolver.createNPMentionsFromParse(constituencyLabels, docTokIdx) - } else if (!loadFromParse) { - autoAnnotationResolver.resolveAnnotatedMentions(docTokIdx) - } - autoAnnotationResolver.prevWord = word - } else { - val constituencyLabels = goldFields(5).split("\\*") - if (constituencyLabels.length >= 1 && loadFromParse) { - if (token.posTag.categoryValue == "PRP" || token.posTag.categoryValue == "PRP$") goldAnnotationResolver.createPRPMentions(token, docTokIdx) - goldAnnotationResolver.createNPMentionsFromParse(constituencyLabels, docTokIdx) - } - } - goldAnnotationResolver.resolveAnnotatedMentions(docTokIdx) - goldAnnotationResolver.prevWord = word - } - } - doc - } - - val tokenizer = """(\(|\||\)|\d+)""".r - val entityTypeTokenizer = """(\(|[^\)]+|\)|)""".r - val asteriskStripper = """\*""".r - - @inline def unescapeBrackets(s: String) = - s match { - case "-LRB-" => "(" - case "-RRB-" => ")" - case "-LSB-" => "[" - case "-RSB-" => "]" - case "-LCB-" => "{" - case "-RCB-" => "}" - case _ => s - } - - class DocumentMentionBoundariesResolver(coref: WithinDocCoref, key: Boolean = 
true, loadFromParse: Boolean = false) { - val mentionBoundaries = new scala.collection.mutable.LinkedHashMap[Span[Section, Token], CoreferentEntityChunk] - val useEntityType = key - val _spanToEntityType = new scala.collection.mutable.LinkedHashMap[Span[Section, Token], String] - var unResolvedEntityType: EntityTypeChunk = null - val entities = Map[String, WithinDocEntity]() - - val parseStack = collection.mutable.Stack[(String, Int)]() - var currParseTree: ConstituencyParse = null - - val openEntityStack = mutable.Map[String, collection.mutable.Stack[CoreferentEntityChunk]]() - - var numMentions = 0 - var prevPhrase = "" - var prevWord = "" - - case class CoreferentEntityChunk(entityId: String, mentionStart: Int, var found: Boolean = false) - case class EntityTypeChunk(entityType: String, start: Int) - - private def tokenizeEntityType(s: String): Array[String] = { - entityTypeTokenizer.findAllIn(s).map(x => asteriskStripper.replaceAllIn(x, "")).map(_.toString).toArray - } - - def createNewSentence(newSentence: Sentence) = { - parseStack.clear() - currParseTree = new ConstituencyParse(newSentence,0,"TOP") - prevPhrase = "" - prevWord = "" - } - - def storeNEREntityChunks(nerChunk: String, sentence: Sentence, token: Token, docTokIdx: Int): Unit = { - val entityTypeTokens = tokenizeEntityType(nerChunk).filterNot(_.isEmpty) - entityTypeTokens match { - case Array("(", entityTypeString: String, ")") => _spanToEntityType.put(new TokenSpan(sentence.section, docTokIdx, 1).value, entityTypeString) - case Array("(", entityTypeString) => - assert(unResolvedEntityType eq null, "Nested Entity Types Found") - unResolvedEntityType = new EntityTypeChunk(entityTypeString, docTokIdx) - case Array(")") => - _spanToEntityType.put(new TokenSpan(sentence.section, unResolvedEntityType.start, docTokIdx - unResolvedEntityType.start + 1).value, unResolvedEntityType.entityType) - unResolvedEntityType = null - case _ => - } - } - - def createPRPMentions(token: Token, docTokIdx: Int): Unit = { - val span = new TokenSpan(coref.document.asSection, docTokIdx, 1) - val newMention = coref.addMention(new Phrase(span, 0))//span.tokens.indexOf(currParseTree.current.getHeadToken(docTokIdx)))) - numMentions += 1 - - val entityChunkForMention = mentionBoundaries.getOrElse(newMention.phrase.value, new CoreferentEntityChunk(coref.document.name + "-" + (-coref.mentions.size), docTokIdx, true)) - //Register that we have found this mention - entityChunkForMention.found = true - val entityKey = if (key) entityChunkForMention.entityId - else coref.document.name + "-" + (-coref.mentions.size) - newMention.attr += new EntityKey(entityKey) - - val (entityTypeLabel, exactTypeExists) = getNEREntityType(newMention, docTokIdx) - newMention.phrase.attr += new OntonotesPhraseEntityType(newMention.phrase, entityTypeLabel, exactTypeExists) - - val corefEntity = coref.entityFromUniqueId(entityKey) - corefEntity += newMention - } - - def createNPMentionsFromParse(constituencyLabels: Array[String], docTokIdx: Int): Unit = { - val bracketOpens = constituencyLabels(0) - val bracketCloses = if (constituencyLabels.length > 1) constituencyLabels(1) else "" - for (nonTerminal <- bracketOpens.split("\\(").drop(1)) { - parseStack.push((nonTerminal, docTokIdx)) - currParseTree.addChild(nonTerminal, docTokIdx) - } - for (close <- bracketCloses) { - val (phrase, start) = parseStack.pop() - val parentPhrase = if (parseStack.nonEmpty) parseStack(0)._1 else "" - currParseTree.current.setEnd(docTokIdx) - if (phrase == "NP") { - val span = new 
TokenSpan(coref.document.asSection, start, docTokIdx - start + 1) - val newMention = coref.addMention(new Phrase(span, -1)) - numMentions += 1 - currParseTree.closeLabel(docTokIdx) - - val entityChunkForMention = mentionBoundaries.getOrElse(newMention.phrase.value, new CoreferentEntityChunk(coref.document.name + "-" + (-coref.mentions.size), start, true)) - //Register that we have found this mention - entityChunkForMention.found = true - //Assign mention to an entity cluster - val entityKey = if (key) entityChunkForMention.entityId - else coref.document.name + "-" + (-coref.mentions.size) - newMention.attr += new EntityKey(entityKey) - val corefEntity = coref.entityFromUniqueId(entityKey) - corefEntity += newMention - //Set OntoNotesEntityTYpe - val (entityTypeLabel, exactTypeExists) = getNEREntityType(newMention, docTokIdx) - newMention.phrase.attr += new OntonotesPhraseEntityType(newMention.phrase, entityTypeLabel, exactTypeExists) - } else currParseTree.closeLabel(docTokIdx) - prevPhrase = phrase - } - } - - def storeCorefEntityChunk(entityLabel: String, sentence: Sentence, docTokIdx: Int, docId: String): Unit = { - val entityLabels = entityLabel.split('|').map(_.trim) - for (label <- entityLabels) { - val corefTags = tokenizeEntityType(label).filterNot(l => l.isEmpty) - corefTags match { - case Array("(", entityId, ")") => mentionBoundaries.put(new Span[Section,Token](sentence.section, docTokIdx, 1), new CoreferentEntityChunk(docId + "-*" + entityId, docTokIdx)) - case Array("(", entityId) => if (openEntityStack.contains(entityId)) - openEntityStack.get(entityId).get.push(new CoreferentEntityChunk(docId + "-*" + entityId, docTokIdx)) - else - openEntityStack.put(entityId, new mutable.Stack[CoreferentEntityChunk]().push(CoreferentEntityChunk(docId + "-*" + entityId, docTokIdx))) - case Array(entityId, ")") => { - val lastOpenedEntityStack = openEntityStack.get(entityId).get - val lastOpenedEntity = lastOpenedEntityStack.pop() - mentionBoundaries.put(new TokenSpan(sentence.section, lastOpenedEntity.mentionStart, docTokIdx - lastOpenedEntity.mentionStart + 1).value, lastOpenedEntity) - } - case _ => - } - } - } - - def getNEREntityType(mention: Mention, docTokIdx: Int): (String, Boolean) = { - var (entityTypeLabel, exactMatchExists) = ("O", false) - //Find all entity type spans within the given mention - val entityTypesForSpan = _spanToEntityType.filterKeys(mention.phrase.value.contains) - if (entityTypesForSpan.nonEmpty && useEntityType) { - val exactMatch = entityTypesForSpan.find(entitySpan => (entitySpan._1.start == mention.phrase.start) && (entitySpan._1.end == docTokIdx)) - exactMatchExists = exactMatch.isDefined - if (exactMatch.isDefined) { - entityTypeLabel = exactMatch.get._2 - } - else if (!useExactEntTypeMatch) { - val headEntityType = entityTypesForSpan.find(s => s._1.exists(t => t == mention.phrase.headToken)) - if (headEntityType.isDefined) entityTypeLabel = headEntityType.get._2 - } - } - (entityTypeLabel, exactMatchExists) - } - - def resolveAnnotatedMentions(docTokIdx: Int) { - for ((goldMentionSpan, goldMentionEntityInfo) <- mentionBoundaries.filter { case (mentionSpan, mentionEntityInfo) => !mentionEntityInfo.found}) { - //assert(currParseTree.current.parent.start == start,"Not in Parent") - val newMention = coref.addMention(new Phrase(coref.document.asSection, goldMentionSpan.start, goldMentionSpan.length, -1)) - //Find and add an OntonotesPhraseEntityType - val (entityTypeLabel, exactTypeExists) = getNEREntityType(newMention, docTokIdx) - newMention.phrase.attr += 
new OntonotesPhraseEntityType(newMention.phrase, entityTypeLabel, exactTypeExists) - numMentions += 1 - //Assign mention to entity - val entityChunkForMention = mentionBoundaries.getOrElse(newMention.phrase.value, new CoreferentEntityChunk(coref.document.name + "-" + coref.mentions.size + 1, goldMentionSpan.start, true)) - entityChunkForMention.found = true - val entityKey = - if (key) entityChunkForMention.entityId - else coref.document.name + "-" + coref.mentions.size + 1 - newMention.attr += new EntityKey(entityKey) - val corefEntity = coref.entityFromUniqueId(entityKey) - corefEntity += newMention - } - } - } - - def hasNext = goldDocs.hasNext -} - -protected class ConllOWPLIterator(file:String) extends Iterator[(String, Iterator[String])] { - private val source = scala.io.Source.fromFile(file).getLines() - - private val DocStart = """#begin document \(([^)]+)\).*""".r - private val DocEnd = """#end document.*""".r - - private var currentLine = source.next() - - def next() = { - assert(DocStart.findFirstMatchIn(currentLine).isDefined, "Failed to find start of document") - val DocStart(id) = currentLine - val fullId = id+"-"+currentLine.takeRight(3) - val lines = mutable.ArrayBuffer[String]() - while(!DocEnd.findFirstMatchIn(currentLine).isDefined && source.hasNext) { - currentLine = source.next() - lines += currentLine - } - while(!DocStart.findFirstMatchIn(currentLine).isDefined && source.hasNext) currentLine = source.next() - fullId -> lines.toIterator - } - - def hasNext = { - val res = source.hasNext - res - } -} - -case class MentionSpeaker(name: String) \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadDirectory.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadDirectory.scala new file mode 100644 index 0000000..f6531e4 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/load/LoadDirectory.scala @@ -0,0 +1,10 @@ +package cc.factorie.app.nlp.load + +import cc.factorie.app.nlp.Document + +/** The interface common to objects that create Documents from the files in a directory. + * + *@author Andrew McCallum */ +trait LoadDirectory { + def fromDirectory(dir:java.io.File): Seq[Document] +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadGermeval2014.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadGermeval2014.scala deleted file mode 100644 index d5be0e4..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadGermeval2014.scala +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
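The gold/auto behaviour spelled out in the loader's header comment above can be exercised roughly as follows; this is a sketch only, with invented paths:

```scala
import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.load.LoadConllCoreference

// Gold only: gold mentions (plus parse-derived singletons when loadFromParse = true)
// are attached to Document.targetCoref.
val goldOnly: Seq[Document] =
  LoadConllCoreference.load("data/conll-2012/train.gold_conll", limitNumDocuments = -1, loadFromParse = true)

// Gold + auto: predicted mentions from the *_auto_conll data are additionally
// attached to Document.coref, ready to be coreferred.
val withAuto: Seq[Document] =
  LoadConllCoreference.load("data/conll-2012/train.gold_conll", -1, loadFromParse = true,
    autoDirOpt = Some("data/conll-2012/auto/")) // hypothetical directory of *_auto_conll files
```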
*/ -package cc.factorie.app.nlp.load - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.ner._ -import cc.factorie.util.FastLogging - -import scala.collection.mutable.ArrayBuffer - -/* Loader for Germeval 2014 data - @author Peter Schueller - 1 token ID - 2 word form - 3 gold named entity tag level 1 - 4 gold named entity tag level 2 (nested named entity) - */ - -class LoadGermeval2014 extends Load with FastLogging { - // competition format = BIO - def fromSource(source:io.Source): Seq[Document] = fromSource(source,"BIO") - // alternate format = BILOU - def fromSource(source:io.Source, encoding:String): Seq[Document] = { - def newDocument(name:String): Document = { - var document = new Document("").setName(name) - document.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass - document.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass - encoding match { - case "BIO" => { - document.annotators(classOf[Lvl1BioGermevalNerTag]) = UnknownDocumentAnnotator.getClass - document.annotators(classOf[Lvl2BioGermevalNerTag]) = UnknownDocumentAnnotator.getClass } - case "BILOU" => { - document.annotators(classOf[Lvl1BilouGermevalNerTag]) = UnknownDocumentAnnotator.getClass - document.annotators(classOf[Lvl2BilouGermevalNerTag]) = UnknownDocumentAnnotator.getClass } - case _ => throw new Error("Germeval2014Load supports only BIO and BILOU encodings") - } - document - } - - val documents = new ArrayBuffer[Document] - var document = newDocument("Germeval2014-"+documents.length) - documents += document - var sentence = new Sentence(document) - val rComment = """#.*""".r - val rEmpty = """\S*""".r - for (line <- source.getLines()) { - line match { - case rComment() => { } // ignore comments - case rEmpty() => { // empty line starts new sentence - // be robust to double empty lines - if (sentence.tokens.size > 0) { - document.appendString("\n") - document.asSection.chainFreeze - document = newDocument("Germeval2014-"+documents.length) - documents += document - sentence = new Sentence(document) - } } - case _ => addToken(document, sentence, line, encoding) - } - } - logger.info("Loaded "+documents.length+" documents with "+documents.map(_.sentences.size).sum+" sentences with "+documents.map(_.tokens.size).sum+" tokens total") - documents - } - - def addToken(document:Document, sentence:Sentence, line:String, encoding:String): Token = { - val fields = line.split("\t") - val word : String = fields(1) - val ner1gold : String = fields(2) - val ner2gold : String = fields(3) - if (sentence.length > 0) document.appendString(" ") - val token = new Token(sentence, word) - encoding match { - case "BIO" => { - token.attr += new LabeledLvl1BioGermevalNerTag(token, ner1gold) - token.attr += new LabeledLvl2BioGermevalNerTag(token, ner2gold) } - case "BILOU" => { - token.attr += new LabeledLvl1BilouGermevalNerTag(token, ner1gold) - token.attr += new LabeledLvl2BilouGermevalNerTag(token, ner2gold) } - } - token - } -} - diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadHTML.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadHTML.scala deleted file mode 100644 index b4039a6..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadHTML.scala +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. 
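A minimal sketch of driving the Germeval 2014 loader above with its default BIO encoding; the data path is hypothetical:

```scala
import scala.io.Source
import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.load.LoadGermeval2014

// Tab-separated columns per token: id, word form, level-1 NER tag, level-2 (nested) NER tag.
val loader = new LoadGermeval2014
val docs: Seq[Document] = loader.fromSource(Source.fromFile("data/germeval2014/train.tsv"))
// loader.fromSource(source, "BILOU") attaches the BILOU-encoded tag variables instead.
```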
- This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -package cc.factorie.app.nlp.load -import java.io.File - -import cc.factorie.app.nlp.Document - -object LoadHTML { - def fromFile(file:File, segmentSentences:Boolean): Document = throw new Error("Not yet implemented.") -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadNYTimesXML.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadNYTimesXML.scala deleted file mode 100644 index 510c845..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadNYTimesXML.scala +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -package cc.factorie.app.nlp.load -import java.io.File - -import cc.factorie.app.nlp.Document - -import scala.xml._ - -/** Load a Document from a single NYTimes article in the XML format released by NYTimes and described in - Evan Sandhaus (2008), "The New York Times Annotated Corpus," Linguistic Data Consortium, Philadelphia. */ -object LoadNYTimesXML { - def fromFile(file:File): Seq[Document] = { - val article = XML.loadFile(file) - //println(article \\ "head" \\ "title" text) - //println(article \ "head" \ "title" text) - //println(" charcount "+ (article \\ "body" \\ "body.content").text.length) - val content = article \ "head" \ "docdata" \ "identified-content" - //print("Reading ***"+(article\"head"\"title").text+"***") - // TODO This does not include the headline, perhaps it should -akm - LoadPlainText.fromString((article \ "body" \ "body.content").text).map(_.setName(file.getCanonicalPath)) - } -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadOWPL.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadOWPL.scala deleted file mode 100644 index 8582ed8..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadOWPL.scala +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
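For the NYTimes XML loader above, usage reduces to a single call; a sketch with a hypothetical article path:

```scala
import java.io.File
import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.load.LoadNYTimesXML

// Parses one article in the NYT Annotated Corpus XML layout and returns its body text
// as plain-text Documents named after the file's canonical path.
val docs: Seq[Document] = LoadNYTimesXML.fromFile(new File("nyt_corpus/data/1987/01/01/0000001.xml"))
```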
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.load - -import cc.factorie.app.nlp.{Document, Sentence, Token, UnknownDocumentAnnotator} -import cc.factorie.variable.MutableCategoricalVar - -/** - * Author: martin, strubell - * Date: 2/25/12, 4/28/14 - * - * On each line, there should be a whitespace-delimited list of the form: - * word label1 label2 label3 ... - * - * Sentences are separated by blank lines. - * Returns all data in a single document. - * - * labelMaker is a function that takes all fields but the first in a line of the file, - * and returns a sequence of MutableCategoricalVars corresponding to the tags that - * any subset of those fields represent - */ - -object LoadOWPL { - def fromFilename(file: String, labelMaker: (Token, Seq[String]) => Seq[MutableCategoricalVar[String]], separator: String = "\\s+", limitSentenceCount: Int = -1): Seq[Document] = { - val doc = new Document - doc.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass // register that we have token boundaries - doc.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass // register that we have sentence boundaries - var sentence = new Sentence(doc) - var numSentences = 1 - for (line <- io.Source.fromFile(file).getLines()) { - if (line.trim == "") { - sentence = new Sentence(doc) - numSentences += 1 - if (limitSentenceCount > -1 && numSentences > limitSentenceCount) - return Seq(doc) - } - else { - val fields = line.split(separator) - val word = fields(0) - val token = new Token(sentence, word) - labelMaker(token, fields.drop(1)).foreach(token.attr += _) - doc.appendString(" ") - } - } - doc.asSection.chainFreeze - //println("LoadOWPL doc.tokens.length="+doc.tokens.length+" last token: "+doc.tokens.last.string+" "+doc.tokens.last.attr) - Seq(doc) - } -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadOntonotes5.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadOntonotes5.scala deleted file mode 100644 index 1dc745d..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadOntonotes5.scala +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
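A sketch of the labelMaker contract described in the OWPL loader's comment above; the label class and file path are assumptions chosen for illustration:

```scala
import cc.factorie.app.nlp.{Document, Token}
import cc.factorie.app.nlp.load.LoadOWPL
import cc.factorie.app.nlp.pos.LabeledPennPosTag
import cc.factorie.variable.MutableCategoricalVar

// Assumed two-column input "word POS"; substitute whatever labeled variable class
// actually matches the columns in the file being loaded.
def labelMaker(token: Token, fields: Seq[String]): Seq[MutableCategoricalVar[String]] =
  Seq(new LabeledPennPosTag(token, fields.head))

val docs: Seq[Document] = LoadOWPL.fromFilename("data/train.owpl", labelMaker)
```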
*/ -/* -package cc.factorie.app.nlp.load - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.lemma.TokenLemma -import cc.factorie.app.nlp.ner._ -import cc.factorie.app.nlp.parse.ParseTree -import cc.factorie.app.nlp.pos.LabeledPennPosTag - -import scala.io.Source -*/ -/* Loader for the OntoNotes 5 data - @author Brian Martin, Andrew McCallum - 1 token ID - 2 word form - 3 auto lemma - 4 gold lemma - 5 auto POS tag - 6 gold POS tag - 7 auto feats - 8 gold feats - 9 auto head ID - 10 gold head ID - 11 auto dependency label - 12 gold dependency label - 13 gold secondary dependencies - 14 gold semantic arguments - 15 gold named entity tags - 16 gold coreference - - */ -/* -sealed trait AnnotationType -case object DoNotLoad extends AnnotationType -case object GoldLabel extends AnnotationType -case object AutoLabel extends AnnotationType - -object LoadOntonotes5 extends Load { - private def addDepInfo(s: Sentence, depInfoSeq: Seq[(Int,Int,String)]): Unit = { - //assert(depInfoSeq.map(_._1) == Seq.tabulate(depInfoSeq.length)(i => i), "Token indices: "+depInfoSeq.map(_._1).mkString(" ")) // Assert that we have an entry for each token index, in order - val tree = new ParseTree(s, depInfoSeq.map(_._2), depInfoSeq.map(_._3)) - s.attr += tree - } - - def fromSource(source: Source) = - fromSource(source, filename="?UNKNOWN?", loadLemma=GoldLabel, loadPos=GoldLabel, loadParse=GoldLabel, loadNer=true, nerBilou=true) - - def fromSource(source: Source, filename:String, loadLemma:AnnotationType, loadPos:AnnotationType, loadParse:AnnotationType, loadNer:Boolean, nerBilou:Boolean): Seq[Document] = { - val lines = source.getLines() - val document: Document = new Document().setName("Ontonotes499/" + filename) - document.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass // register that we have token boundaries - document.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass // register that we have sentence boundaries - if (loadPos != DoNotLoad) document.annotators(classOf[pos.PennPosTag]) = UnknownDocumentAnnotator.getClass // register that we have POS tags - if (loadNer) if (nerBilou) document.annotators(classOf[ner.BilouOntonotesNerTag]) = UnknownDocumentAnnotator.getClass else document.annotators(classOf[ner.BioOntonotesNerTag]) = UnknownDocumentAnnotator.getClass - var sentence: Sentence = new Sentence(document) - var depInfoSeq = new collection.mutable.ArrayBuffer[(Int,Int,String)] - for (line <- lines) { - if (line.length < 2) { // Sentence boundary - document.appendString("\n") - addDepInfo(sentence, depInfoSeq) - depInfoSeq = new collection.mutable.ArrayBuffer[(Int,Int,String)] - sentence = null - } else { - if (sentence eq null) - sentence = new Sentence(document) // avoids empty sentence at the end of doc - val fields = line.split('\t') - assert(fields.length >= 10, "Fewer than 10 fields in file "+filename+"\nOffending line:\n"+line) - - val currTokenIdx = fields(0).toInt - 1 - val word = fields(1) - - val autoLemma = fields(2) - val goldLemma = fields(3) - - val autoPartOfSpeech = fields(4) - val goldPartOfSpeech = fields(5) - - // OFF BY 1! - val autoParentIdx = fields(8).toInt - 1 - val goldParentIdx = fields(9).toInt - 1 - - val autoDepLabel = fields(10) - val goldDepLabel = fields(11) - - var ner = fields(14); if (ner == "_") ner = "O" // If we wanted to distinguish "unnamed entities" from background, we wouldn't have this. 
- - document.appendString(" ") - val token = new Token(sentence, word) - loadPos match { - case GoldLabel => {token.attr += new LabeledPennPosTag(token, if (goldPartOfSpeech == "XX") "PUNC" else goldPartOfSpeech)} - case AutoLabel => {token.attr += new LabeledPennPosTag(token, if (autoPartOfSpeech == "XX") "PUNC" else autoPartOfSpeech)} - case DoNotLoad => {/* do nothing */} - } - loadLemma match { - case GoldLabel => {token.attr += new TokenLemma(token, goldLemma)} - case AutoLabel => {token.attr += new TokenLemma(token, autoLemma)} - case DoNotLoad => {/* do nothing */} - } - loadParse match { - case GoldLabel => {depInfoSeq.append((currTokenIdx, goldParentIdx, goldDepLabel))} - case AutoLabel => {depInfoSeq.append((currTokenIdx, autoParentIdx, autoDepLabel))} - case DoNotLoad => {/* do nothing */} - } - if (loadNer) token.attr += (if (nerBilou) new LabeledBilouOntonotesNerTag(token, ner) else new LabeledBioOntonotesNerTag(token, ner)) - } - } - if ((sentence != null) && (loadParse != DoNotLoad)) addDepInfo(sentence, depInfoSeq) - if (nerBilou) convertBioBilou(document.asSection) - - println("Loaded 1 document with "+document.sentences.size+" sentences with "+document.asSection.length+" tokens total from file "+filename) - Seq(document) - } - - def fromFilename(filename:String): Seq[Document] = { - fromFilename(filename, GoldLabel, GoldLabel, GoldLabel, true, true) - } - - def fromFilename(filename:String, loadLemma:AnnotationType, loadPos:AnnotationType, loadParse:AnnotationType, loadNer:Boolean, nerBilou:Boolean): Seq[Document] = - fromSource(Source.fromFile(filename), filename, loadLemma, loadPos, loadParse, loadNer, nerBilou) - - def convertBioBilou(section:Section): Unit = { - /** Return the string of the NER label, including the two letter (B- or I-) prefix. */ - def cat(token:Token): String = if (token eq null) "null" else token.attr[BilouOntonotesNerTag].categoryValue - /** Return true if the strings are equal without their two letter (B- or I-) prefix. 
*/ - def sim(s1:String, s2:String): Boolean = s1.drop(2) == s2.drop(2) - def isU(cat1:String, cat2:String, cat3:String): Boolean = cat2(0) == 'B' && (!sim(cat2, cat3) || cat3(0) == 'B') - def isB(cat1:String, cat2:String, cat3:String): Boolean = cat2(0) == 'B' && sim(cat2, cat3) && cat3(0) == 'I' - def isL(cat1:String, cat2:String, cat3:String): Boolean = cat2(0) == 'I' && sim(cat1, cat2) && (cat3(0) == 'B' || !sim(cat2, cat3)) - def isI(cat1:String, cat2:String, cat3:String): Boolean = cat2(0) == 'I' && cat3(0) == 'I' - for (token <- section.tokens) if (token.attr[LabeledBilouOntonotesNerTag].intValue != 0) { - val nerLabel = token.attr[LabeledBilouOntonotesNerTag] - val cat1 = cat(token.prev); val cat2 = cat(token); val cat3 = cat(token.next) - if (isU(cat1, cat2, cat3)) nerLabel.target.setCategory("U-"+cat2.drop(2))(null) - else if (isB(cat1, cat2, cat3)) nerLabel.target.setCategory("B-"+cat2.drop(2))(null) - else if (isL(cat1, cat2, cat3)) nerLabel.target.setCategory("L-"+cat2.drop(2))(null) - else if (isI(cat1, cat2, cat3)) nerLabel.target.setCategory("I-"+cat2.drop(2))(null) - nerLabel.setToTarget(null) - } - } - - def printDocument(d: Document) = - for (s <- d.sentences) - println(s.attr[ParseTree].toString() + "\n") - - def main(args: Array[String]) = - for (filename <- args) - printDocument(fromFilename(filename).head) -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadPlainText.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadPlainText.scala deleted file mode 100644 index ba0cb51..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadPlainText.scala +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -package cc.factorie.app.nlp.load -import java.io.File - -import cc.factorie.app.nlp._ - -import scala.util.matching.Regex - -/** Create Documents from plain text files. - By default create one Document per file. - To create multiple Documents from one file, set documentSeparator regex. - If the regex specifies a group (via parenthesis) then the Document's name will be set to the match of the contents of this first group. 
*/ -class LoadPlainText(documentSeparator:Regex = null) extends Load with LoadDirectory { - def fromSource(source:io.Source): Seq[Document] = { - val string = source.getLines().mkString("\n") - if (documentSeparator eq null) Seq(new Document(string)) - else { - var docStart = 0 - val matchIterator = documentSeparator.findAllIn(string).matchData - (for (sepMatch <- matchIterator if sepMatch.start != docStart) yield { - val doc = new Document(string.substring(docStart, sepMatch.start)) - if (sepMatch.group(1) ne null) doc.setName(sepMatch.group(1)) - docStart = sepMatch.end - doc - }).toIndexedSeq - } - } - def fromDirectory(dir:File): Seq[Document] = (for (file <- files(dir)) yield fromFile(file)).flatten - private def files(directory:File): Seq[File] = { - if (!directory.exists) throw new Error("File "+directory+" does not exist") - if (directory.isFile) return List(directory) - val result = new scala.collection.mutable.ArrayBuffer[File] - for (entry <- directory.listFiles) { - if (entry.isFile) result += entry - else if (entry.isDirectory) result ++= files(entry) - } - result - } -} - -object LoadPlainText extends LoadPlainText(documentSeparator = null) diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadReACE.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadReACE.scala deleted file mode 100644 index a0bc83c..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadReACE.scala +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
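The plain-text loader above supports both one-Document-per-file and regex-split modes; a hedged sketch, with invented paths and separator pattern:

```scala
import java.io.File
import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.load.LoadPlainText

// One Document per file, recursing through a directory tree.
val perFile: Seq[Document] = LoadPlainText.fromDirectory(new File("data/plain/"))

// Several Documents per file, split on a separator line; capture group 1, when present,
// becomes each Document's name (here the token after "===").
val splitter = new LoadPlainText(documentSeparator = """(?m)^=== (\S+) ===$""".r)
val perSection: Seq[Document] = splitter.fromFile(new File("data/big_corpus.txt"))
```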
*/ -/* -package cc.factorie.app.nlp.load -import java.io.File - -import cc.factorie.app.nlp.{Document, Sentence, Token, UnknownDocumentAnnotator, _} -import cc.factorie.app.nlp.coref.Mention -import cc.factorie.app.nlp.ner.ConllNerSpan -import cc.factorie.app.nlp.phrase.Phrase -import cc.factorie.app.nlp.pos.PennPosTag -import cc.factorie.variable.SetVariable - -import scala.Array.fallbackCanBuildFrom -import scala.xml.{NodeSeq, XML} -*/ -/* -trait ReACEMentionIdentifiers { - val mId: Option[String] - val eId: Option[String] - val headStart: Option[Int] - val headEnd: Option[Int] - val headLength: Option[Int] - val mType: String - val mSubType: String -} - -trait ReACERelationIdentifiers { - val rId: Option[String] - val rType: Option[String] - val rSubtype: Option[String] -} - -trait ReACESentenceAnnotations { - val paragraphId: Option[String] - val sentenceId: Option[String] -} - -trait ReACEWordAnnotations { - val lemma: Option[String] - val pos: Option[String] - val chunk: Option[String] - val nounHead: Option[String] - val verbStem: Option[String] - val verbHead: Option[String] - val verbVoice: Option[String] - val verbNeg: Option[String] -} - -class ReACESentenceId(val sentId: String) - -object LoadReACE { - - private def getAttr(ns: NodeSeq, key: String): Option[String] = { - val fullKey: String = "@" + key - val v = (ns \ fullKey).text - if (v == "") None - else Some(v) - } - - private def makeTokenAnnotations(wordXml: NodeSeq): ReACEWordAnnotations = { - val a: String => Option[String] = getAttr(wordXml, _) - new ReACEWordAnnotations { - val lemma: Option[String] = a("l") - val pos: Option[String] = a("p") - val chunk: Option[String] = a("phr") - val nounHead: Option[String] = a("headn") - val verbStem: Option[String] = a("vstem") - val verbHead: Option[String] = a("headv") - val verbVoice: Option[String] = a("voice") - val verbNeg: Option[String] = a("neg") - } - } - - private def makeDoc(xml: String): Document = { - val doc = new Document().setName(xml) - doc.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass - doc.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass - doc.annotators(classOf[PennPosTag]) = UnknownDocumentAnnotator.getClass - - doc.attr += new ACEFileIdentifier(xml) - val xmlText: NodeSeq = XML.loadFile(xml + ".ttt.xml") - - var currP = 0 - for (p <- xmlText \\ "p") { - currP += 1 - for (s <- p \\ "s") { - val sId = getAttr(s, "id") - val sent = new Sentence(doc) - sent.attr += new ReACESentenceAnnotations { - val paragraphId = Some(currP.toString) - val sentenceId = sId - } - for (w <- s \\ "w") { - val t = new Token(sent, w.text) - doc.appendString(" ") - val annotations = makeTokenAnnotations(w) - t.attr += annotations // TODO I think these annotations should go in more standard FACTORIE NLP form -akm - annotations.pos.foreach(p => t.attr += new PennPosTag(t, p)) - } - } - } - doc - } - - private def lookupEntityMention(doc: Document, id: String): Option[Mention] = { - val opt = doc.attr[ner.ConllNerSpanBuffer].find { - s => { - val a = s.attr[ReACEMentionIdentifiers] - (a ne null) && a.mId.get == id - } - } - if (opt == None) None - else Some(opt.get.asInstanceOf[Mention]) - } - - def addNrm(doc: Document, xml: String): Document = { - val coref = doc.getCoref - var xmlText: NodeSeq = XML.loadFile(xml + ".nrm.xml") - assert(doc.attr[ACEFileIdentifier].fileId == xml) // adding to the right document? 
- - // Add mentions - for (mention <- xmlText \\ "ne") { - // named-entity mentions - // phrase span - val start = (mention \ "@fr").text.drop(1).toInt - 1 - val end = (mention \ "@to").text.drop(1).toInt - 1 - val length = end - start + 1 - - // head span - val hstart = (mention \ "@hfr").text.drop(1).toInt - 1 - val hend = (mention \ "@hto").text.drop(1).toInt - 1 - val hlength = hend - hstart + 1 - - // ner type - val nerType = (mention \ "@t").text - val nerSubType = (mention \ "@st").text - - val phrase = new Phrase(doc.asSection,start,length,hend) - phrase.attr += new ConllNerSpan(doc.asSection,start,length,nerType) - - val m = coref.addMention(phrase) - - m.attr += new ReACEMentionIdentifiers { - val mId = getAttr(mention, "id") - val eId = getAttr(mention, "gid") - val headStart = Some(hstart) - val headEnd = Some(hend) - val headLength = Some(hlength) - val mType = nerType - val mSubType = nerSubType - } - } - - // Add relations - xmlText = XML.loadFile(xml + ".nrm.xml") // is there a way to avoid rereading? -// doc.attr += new RelationMentions -// for (rel <- xmlText \\ "rel") { -// val ids = new ReACERelationIdentifiers { -// val rId = getAttr(rel, "id") -// val rType = getAttr(rel, "t") -// val rSubtype = getAttr(rel, "st") -// } -// -// val e1 = lookupEntityMention(doc, getAttr(rel, "e1").get).get -// val e2 = lookupEntityMention(doc, getAttr(rel, "e2").get).get -// val args = Seq(e1, e2) -// -// val m = new RelationMention(e1, e2, ids.rType.get, Some(ids.rSubtype.get)) -// m.attr += ids -// doc.attr[RelationMentions].add(m)(null) -// args.foreach(_.attr.getOrElseUpdate(new RelationMentions).add(m)(null)) -// } - - doc - } - - class MentionsSet extends SetVariable[Mention] - // TODO: consider renaming this to fromFile to match the API for other loaders. - // But if renamed, how can the user know that ttt.xml is required? - def fromTtt(ttt: String): Document = { - val fileStr = ttt.dropRight(8) - val doc = makeDoc(fileStr) - addNrm(doc, fileStr) - doc - } - - def fromDirectory(dir: String, takeOnly: Int = Int.MaxValue): Seq[Document] = - new File(dir).listFiles().filter(_.getName.endsWith(".ttt.xml")).take(takeOnly).map(f => fromTtt(f.getAbsolutePath)) - - def main(args: Array[String]): Unit = { - val docs = fromDirectory(args(0)) - for (d <- docs) - d.attr[ner.ConllNerSpanBuffer].foreach(s => println(s)) - } - -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadWSJMalt.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadWSJMalt.scala deleted file mode 100644 index 6d27741..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadWSJMalt.scala +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -/* -package cc.factorie.app.nlp.load - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.lemma.TokenLemma -import cc.factorie.app.nlp.ner._ -import cc.factorie.app.nlp.parse.ParseTree -import cc.factorie.app.nlp.pos.LabeledPennPosTag - -import scala.io.Source -*/ -/* Loader for the OntoNotes 5 data - @author Brian Martin, Andrew McCallum - 1 token ID - 2 word form - 3 gold lemma - 4 auto lemma - 5 gold POS tag - 6 auto POS tag - 7 gold feats - 8 auto feats - 9 auto head ID - 10 gold head ID - 11 auto dependency label - 12 gold dependency label - 13 gold secondary dependencies - 14 gold semantic arguments - 15 gold named entity tags - 16 gold coreference - */ -/* -object LoadWSJMalt extends Load { - private def addDepInfo(s: Sentence, depInfoSeq: Seq[(Int,Int,String)]): Unit = { - //assert(depInfoSeq.map(_._1) == Seq.tabulate(depInfoSeq.length)(i => i), "Token indices: "+depInfoSeq.map(_._1).mkString(" ")) // Assert that we have an entry for each token index, in order - val tree = new ParseTree(s, depInfoSeq.map(_._2), depInfoSeq.map(_._3)) - s.attr += tree - } - - def fromSource(source: Source): Seq[Document] = - fromSource(source, filename="?UNKNOWN?", loadLemma=GoldLabel, loadPos=GoldLabel, loadParse=GoldLabel, loadNer=true, nerBilou=true) - - def fromSource(source: Source, filename:String, loadLemma:AnnotationType, loadPos:AnnotationType = GoldLabel, loadParse:AnnotationType = GoldLabel, loadNer:Boolean = true, nerBilou:Boolean = false): Seq[Document] = { - val lines = source.getLines() - val document: Document = new Document().setName("Ontonotes499/" + filename) - document.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass // register that we have token boundaries - document.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass // register that we have sentence boundaries - if (loadPos != DoNotLoad) document.annotators(classOf[pos.PennPosTag]) = UnknownDocumentAnnotator.getClass // register that we have POS tags - if (loadNer) if (nerBilou) document.annotators(classOf[ner.BilouOntonotesNerTag]) = UnknownDocumentAnnotator.getClass else document.annotators(classOf[ner.BioOntonotesNerTag]) = UnknownDocumentAnnotator.getClass - var sentence: Sentence = new Sentence(document) - var depInfoSeq = new collection.mutable.ArrayBuffer[(Int,Int,String)] - for (line <- lines) { - if (line.length < 2) { // Sentence boundary - document.appendString("\n") - addDepInfo(sentence, depInfoSeq) - depInfoSeq = new collection.mutable.ArrayBuffer[(Int,Int,String)] - sentence = null - } else { - if (sentence eq null) - sentence = new Sentence(document) // avoids empty sentence at the end of doc - val fields = line.split('\t') - assert(fields.length >= 10, "Fewer than 10 fields in file "+filename+"\nOffending line:\n"+line) - - val currTokenIdx = fields(0).toInt - 1 - val word = fields(1) - - val goldLemma = fields(2) - val autoLemma = fields(3) - - val goldPartOfSpeech = fields(4) - val autoPartOfSpeech = fields(5) - - // OFF BY 1! - val autoParentIdx = fields(7).toInt - 1 - val goldParentIdx = fields(8).toInt - 1 - - val autoDepLabel = fields(9) - val goldDepLabel = fields(10) - - var ner = fields(13); if (ner == "_") ner = "O" // If we wanted to distinguish "unnamed entities" from background, we wouldn't have this. 
- - document.appendString(" ") - val token = new Token(sentence, word) - loadPos match { - case GoldLabel => {token.attr += new LabeledPennPosTag(token, if (goldPartOfSpeech == "XX") "PUNC" else goldPartOfSpeech)} - case AutoLabel => {token.attr += new LabeledPennPosTag(token, if (autoPartOfSpeech == "XX") "PUNC" else autoPartOfSpeech)} - case DoNotLoad => {/* do nothing */} - } - loadLemma match { - case GoldLabel => {token.attr += new TokenLemma(token, goldLemma)} - case AutoLabel => {token.attr += new TokenLemma(token, autoLemma)} - case DoNotLoad => {/* do nothing */} - } - loadParse match { - case GoldLabel => {depInfoSeq.append((currTokenIdx, goldParentIdx, goldDepLabel))} - case AutoLabel => {depInfoSeq.append((currTokenIdx, autoParentIdx, autoDepLabel))} - case DoNotLoad => {/* do nothing */} - } - if (loadNer) token.attr += (if (nerBilou) new LabeledBilouOntonotesNerTag(token, ner) else new LabeledBioOntonotesNerTag(token, ner)) - } - } - if ((sentence != null) && (loadParse != DoNotLoad)) addDepInfo(sentence, depInfoSeq) - if (nerBilou) convertBioBilou(document.asSection) - - println("Loaded 1 document with "+document.sentences.size+" sentences with "+document.asSection.length+" tokens total from file "+filename) - Seq(document) - } - - def fromFilename(filename: String): Seq[Document] = - fromFilename(filename, loadLemma=GoldLabel, loadPos=GoldLabel, loadParse=GoldLabel, loadNer=true, nerBilou=true) - - def fromFilename(filename:String, loadLemma:AnnotationType, loadPos:AnnotationType, loadParse:AnnotationType, loadNer:Boolean, nerBilou:Boolean): Seq[Document] = - fromSource(Source.fromFile(filename), filename, loadLemma, loadPos, loadParse, loadNer, nerBilou) - - def convertBioBilou(section:Section): Unit = { - /** Return the string of the NER label, including the two letter (B- or I-) prefix. */ - def cat(token:Token): String = if (token eq null) "null" else token.attr[BilouOntonotesNerTag].categoryValue - /** Return true if the strings are equal without their two letter (B- or I-) prefix. 
*/ - def sim(s1:String, s2:String): Boolean = s1.drop(2) == s2.drop(2) - def isU(cat1:String, cat2:String, cat3:String): Boolean = cat2(0) == 'B' && (!sim(cat2, cat3) || cat3(0) == 'B') - def isB(cat1:String, cat2:String, cat3:String): Boolean = cat2(0) == 'B' && sim(cat2, cat3) && cat3(0) == 'I' - def isL(cat1:String, cat2:String, cat3:String): Boolean = cat2(0) == 'I' && sim(cat1, cat2) && (cat3(0) == 'B' || !sim(cat2, cat3)) - def isI(cat1:String, cat2:String, cat3:String): Boolean = cat2(0) == 'I' && cat3(0) == 'I' - for (token <- section.tokens) if (token.attr[LabeledBilouOntonotesNerTag].intValue != 0) { - val nerLabel = token.attr[LabeledBilouOntonotesNerTag] - val cat1 = cat(token.prev); val cat2 = cat(token); val cat3 = cat(token.next) - if (isU(cat1, cat2, cat3)) nerLabel.target.setCategory("U-"+cat2.drop(2))(null) - else if (isB(cat1, cat2, cat3)) nerLabel.target.setCategory("B-"+cat2.drop(2))(null) - else if (isL(cat1, cat2, cat3)) nerLabel.target.setCategory("L-"+cat2.drop(2))(null) - else if (isI(cat1, cat2, cat3)) nerLabel.target.setCategory("I-"+cat2.drop(2))(null) - nerLabel.setToTarget(null) - } - } - - def printDocument(d: Document) = - for (s <- d.sentences) - println(s.attr[ParseTree].toString() + "\n") - - def main(args: Array[String]) = - for (filename <- args) - printDocument(fromFilename(filename).head) - -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/LoadWikipediaPlainText.scala b/src/main/scala/cc/factorie/app/nlp/load/LoadWikipediaPlainText.scala deleted file mode 100644 index ef1e88e..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/LoadWikipediaPlainText.scala +++ /dev/null @@ -1,165 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* -package cc.factorie.app.nlp.load -import java.io.{File, _} - -import cc.factorie.app.nlp._ -import org.apache.commons.compress.compressors.CompressorStreamFactory -*/ -/** Create Documents from plain text files. - By default create one Document per file. - To create multiple Documents from one file, set documentSeparator regex. - If the regex specifies a group (via parenthesis) then the Document's name will be set to the match of the contents of this first group. */ -/* -class LoadWikipediaPlainText { - /** This assumes that the file has format of enwiki-latest-pages-articles.xml.bz2. */ - def fromCompressedFilename(filename: String, maxArticleCount: Long): Iterator[Document] = { - //require(filename.startsWith("enwiki") && filename.endsWith(".xml.bz2")) - val inputStream = new CompressorStreamFactory().createCompressorInputStream(CompressorStreamFactory.BZIP2, new FileInputStream(filename)) - fromInputStream(inputStream, maxArticleCount) - } - /** This assumes that the file has format of enwiki-latest-pages-articles.xml.bz2. 
*/ - def fromCompressedFile(file: File, maxArticleCount: Long): Iterator[Document] = { - val inputStream = new CompressorStreamFactory().createCompressorInputStream(CompressorStreamFactory.BZIP2, new FileInputStream(file)) - fromInputStream(inputStream, maxArticleCount) - } - - // An io.Source version of this just keeps growing until it runs out of memory, as if io.Source were keeping the entire contents in memory. - // So we make an InputStream version of this method. - - def fromInputStream(input: InputStream, maxArticleCount: Long): Iterator[Document] = { - new Iterator[Document] { - val bufferedReader = new BufferedReader(new InputStreamReader(input)) - var bufferedReaderDone = false - private var articleCount = 0 - private var nextDocument = getNextDocument - - // Keep getting next document until we get a non-null one or the bufferedReader is done - private def getNextDocument: Document = { - var result = getNextDocument1 - while ((result eq null) && !bufferedReaderDone) - result = getNextDocument - result - } - // Try to fetch one document, but if there is no text in this article, return null - // TODO: remove talk pages! -luke - private def getNextDocument1: Document = { - val nonBracket = "[^\\[\\]]*" - val cleaningRegex = ("(?s)" + (List( // Make "." match also match newline - "<!--(?:.(?!-->))+.-->", // Remove comments - "<ref>(?:.(?!</ref>))*.</ref>", // Remove everything inside and - "<math>(?:.(?!</math>))*.</math>", // Remove everything inside and - "<code>(?:.(?!</code>))*.</code>", // Remove everything inside and - "<gallery(?:.(?!</gallery>))*.</gallery>", // Remove everything inside and - "<(?:.(?!>))*.>", //Remove everything between < and > - "&(?:[a-z]{2,6};)+", // Remove " and solo > (meaning > symbol) and &nbsp; and all other similar patterns - "Category:", - "#REDIRECT", - "^\\s+", // Remove leading whitespace - "\\s+$" // Remove trailing whitespace - ).mkString("|"))).r - - var sb = new StringBuffer(2048*16) - var docDone = false - var title: String = null - var insideText = false - var line: String = null - - while ({ line = bufferedReader.readLine(); (line ne null) && !docDone }) { - //println(articleCount.toString+" Line>>> "+line) - if (!insideText) { - val titleIndex = line.indexOf("") - if (titleIndex >= 0) { - val titleEndIndex = line.indexOf("") - title = line.substring(titleIndex+7, titleEndIndex) - //println(title) - } else if (line.contains("")) { - insideText = true; sb append line.substring(line.lastIndexOf('>')+1) - } - } else { - if (line.contains("")) { insideText = false; docDone = true; sb append line.substring(0, line.indexOf('<')) } - else { sb append line; sb append '\n' } - } - } - if (line eq null) { input.close(); bufferedReaderDone = true } - sb = removeNestedBrackets(sb) - val text = cleaningRegex.replaceAllIn(sb, " ") - if (text.length == 0) return null - articleCount += 1 - new Document(text).setName(title) - } - def hasNext: Boolean = articleCount < maxArticleCount && (nextDocument ne null) - def next(): Document = { - val result = nextDocument - nextDocument = getNextDocument - result - } - - private def removeNestedBrackets(s:StringBuffer): StringBuffer = { - val sb = new StringBuffer(s.length) - var sb2 = new StringBuffer(1024) - var curlyOpenCount = 0 - var squareOpenCount = 0 - var i = 0; val len = s.length - while (i < len) { - val c = s.codePointAt(i).toChar - if (c == '{') { curlyOpenCount += 1 /*; sb.append("{"+curlyOpenCount)*/ } - else if (c == '[') { squareOpenCount += 1 /*; sb.append("["+squareOpenCount)*/ } - else if (c == 
'}' && curlyOpenCount > 0) { curlyOpenCount -= 1 ; sb.append(' ') /* Why it this "append" necessary (otherwise large chunks of text missing before "In Aristotle's terminology..."); sb.append("}"+curlyOpenCount)*/ } // include (openCount > 0) because sometimes a }} will appear inside a comment. - else if (c == ']' && squareOpenCount > 0) { - squareOpenCount -= 1 - //sb.append("]"+squareOpenCount) - if (squareOpenCount == 0) { - // Handling [[wikt:anarchism|anarchism]] and [[Taoism|Taoist]] and [[File:WilliamGodwin.jpg|left|thumb|[[William Godwin]] and [[wiktionary:anthropology|anthropology]] and [[w:Charles Lyell|Charles Lyell's]] - val s2 = sb2.toString - val colonIndex = s2.indexOf(':') - val barIndex = s2.indexOf('|') - if (colonIndex >= 1) { - //val prefix = s2.substring(0, 3); if (prefix == "wik") { if (barIndex > 0) sb.append(s2.substring(colonIndex+1, barIndex)) else sb.append(s2.substring(colonIndex+1)) } - if (s2(0) == 'w') { if (barIndex > 0) sb.append(s2.substring(barIndex+1)) else sb.append(s2.substring(colonIndex+1)) } - } else if (colonIndex == -1) { - if (barIndex > 0) sb.append(s2.substring(barIndex+1)) - //if (barIndex > 0) sb.append(s2.substring(0, barIndex)) // Note: this appends the Wikipedia title, whereas the Wikipedia page text would show the part after the '|', not before. -akm - else sb.append(s2) - } - //if (!s2.contains(':')) sb append s2 - sb2 = new StringBuffer(1024) - } - } else if (curlyOpenCount == 0) { - if (squareOpenCount == 0) sb append c - else if (squareOpenCount == 2) sb2 append c - } - i += 1 - } - sb - } - - } - } - -} - -/** A simple command-line runnable Wikipedia text extractor. - Usage: LoadWikipediaPlainText 1000 enwiki-latest-pages-articles.xml.bz2 - will print to stdout the first 1000 non-empty Wikipedia articles. */ -object LoadWikipediaPlainText extends LoadWikipediaPlainText { - def main(args:Array[String]): Unit = { - val docs = fromCompressedFilename(args(1), args(0).toInt) - for (doc <- docs) { - println(doc.string) - println("\n+++++++++++++++++++++++++++++++++++++++++\n\n") - } - } -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/TacFileIterator.scala b/src/main/scala/cc/factorie/app/nlp/load/TacFileIterator.scala deleted file mode 100644 index 4fa889d..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/TacFileIterator.scala +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -/* -package cc.factorie.app.nlp.load - -import java.io._ -import java.util.Scanner -import java.util.zip.GZIPInputStream - -import cc.factorie.app.nlp.Document - -object TACDocTypes { - sealed trait TACDocumentType - case object Newswire extends TACDocumentType - case object DiscussionForum extends TACDocumentType - case object WebDocument extends TACDocumentType - - object TACDocumentType { - def fromFilePath(f:File):TACDocumentType = { - val path = f.getAbsolutePath.toLowerCase - if(path.contains("discussion_forums")) { - DiscussionForum - } else if(path.contains("newswire")) { - Newswire - } else if(path.contains("web")) { - WebDocument - } else { - throw new Exception("Unable to assign document at path %s to a document type".format(path)) - } - } - } -} - - -/** - * @author John Sullivan - */ -class TacFileIterator(tacDocFile:File) extends Iterator[Document] { - import TACDocTypes._ - - private val docEndString = """""" - private val webDocStartString = """""" - private val docIdRegex = """(?i)]*>""".r - private val webDocIdRegex = """(?i) ([^ ]+) """.r - - /** we use scanner here so that when we recreate the lines by adding \n we don't change - * the character count on documents that may use crlf to delimit lines - */ - private val tacReader = new Scanner(if(tacDocFile.getName.endsWith(".gz")) { - new GZIPInputStream(new FileInputStream(tacDocFile)) - } else { - new FileInputStream(tacDocFile) - }).useDelimiter("\n") - - private var docBuffer = new StringBuilder() - private var line = null.asInstanceOf[String] - private var lineNum = 0 - - // grouping together to avoid forgetting something - @inline - private def advanceLine() { - docBuffer append line - docBuffer append "\n" - line = if(tacReader.hasNext) tacReader.next() else null - lineNum += 1 - } - - //priming the pump - we don't call advanceLine because we don't want to add a null to the start of our doc - line = if(tacReader.hasNext) tacReader.next() else null - lineNum += 1 - - def next() = { - - val docIdMatchOpt = docIdRegex.unapplySeq(line).map(_.head) - - // We should be at the start of a new document here, otherwise we have a problem. - assert(line.equalsIgnoreCase(webDocStartString) || docIdMatchOpt.isDefined, "Found line: |%s| that was not a valid doc start at line %d in %s".format(line, lineNum, tacDocFile.getName)) - val docId = if(docIdMatchOpt.isDefined) { - docIdRegex.unapplySeq(line).get.head - //var docIdRegex(docId) = line - } else if(line equalsIgnoreCase webDocStartString) { // we know that one must be true but let's not tempt fate - advanceLine() - //var webDocIdRegex(docId) = line - webDocIdRegex.unapplySeq(line).get.head - } else { - throw new Exception("Found line: |%s| that was not a valid doc start at line %d in %s".format(line, lineNum, tacDocFile.getName)) - } - - while(!line.equalsIgnoreCase(docEndString)) { - advanceLine() - } - // the loop exits when the doc end is found, but that us still part of the previous document so we need to consume it. 
- advanceLine() - val docString = docBuffer.toString() - docBuffer = new StringBuilder() - val doc = new Document(docString).setName(docId) - doc.attr += TACDocumentType.fromFilePath(tacDocFile) - doc.annotators += classOf[TACDocumentType] -> this.getClass - doc - } - - def hasNext = line != null -} - -object TacFileIterator { - def main(args:Array[String]) { - val f = new File(args(0)) - - val doc = new TacFileIterator(f).next() - println(doc.name) - val wrt = new BufferedWriter(new FileWriter(doc.name)) - wrt.write(doc.string) - wrt.flush() - wrt.close() - - } -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/load/XMLSectionalizer.scala b/src/main/scala/cc/factorie/app/nlp/load/XMLSectionalizer.scala deleted file mode 100644 index a011478..0000000 --- a/src/main/scala/cc/factorie/app/nlp/load/XMLSectionalizer.scala +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* -package cc.factorie.app.nlp.load - -import cc.factorie.app.nlp.load.TACDocTypes._ -import cc.factorie.app.nlp.{Document, DocumentAnnotator, Section, Token} - -import scala.collection.mutable - -/** The token span is assumed to be contiguous */ -abstract class TACSection(tks:Iterable[Token]) extends Section { - val document:Document = tks.head.document - val stringStart:Int = tks.head.stringStart - val stringEnd:Int = tks.last.stringEnd - // this needs to go after the definition of document because of the wonky way - // that token.document and section.document interact. - tks foreach this.+= -} -class UsableText(tokens:Iterable[Token]) extends TACSection(tokens) -class UnusableText(tokens:Iterable[Token]) extends TACSection(tokens) - -/** A document annotator that creates [[UsableText]] sections for texts within boundaryToken that - * are not within excludeTokens. Everything else goes in [[UnusableText]] sections. 
*/ -class XMLSectionalizer(boundaryToken:String, excludeTokens:Set[String]) extends DocumentAnnotator { - sealed trait State - case object Usable extends State - case object Unusable extends State - - val acceptedOpenTag = ("""(?i)< *(""" + boundaryToken + """)[^\n>]*?>""").r - val acceptedCloseTag = ("""(?i)""").r - - val excludedOpenTag = ("""(?i)< *(""" + excludeTokens.mkString("|") + """)[^\n>]*?>""").r - val excludedCloseTag = ("""(?i)""").r - - def tokenAnnotationString(token: Token) = null - - val prereqAttrs = Seq(classOf[Token]) - val postAttrs = Seq(classOf[TACSection]) - - - def process(document:Document) = { - val tagStack = mutable.Stack[String]() - val stateStack = mutable.Stack[State]() - stateStack push Unusable - val sectionBuffer = mutable.ArrayBuffer[TACSection]() - val tokenBuffer = mutable.ArrayBuffer[Token]() - document.tokens.foreach { t => - (t.string, stateStack.top) match { - case (acceptedOpenTag(tag), Unusable) => - tokenBuffer += t - tagStack push tag.asInstanceOf[String] - sectionBuffer += new UnusableText(tokenBuffer) - tokenBuffer.clear() - stateStack push Usable - case (acceptedCloseTag(tag), Usable) if tagStack.headOption == Some(tag.asInstanceOf[String]) => - tagStack.pop() - if(tokenBuffer.nonEmpty) { - sectionBuffer += new UsableText(tokenBuffer) - tokenBuffer.clear() - } - stateStack.pop() - tokenBuffer += t - case (excludedOpenTag(tag), Usable) => - if(tokenBuffer.nonEmpty) { - sectionBuffer += new UsableText(tokenBuffer) - tokenBuffer.clear() - } - tokenBuffer += t - stateStack push Unusable - case (excludedOpenTag(tag), Unusable) => - tokenBuffer += t - stateStack push Unusable - case (excludedCloseTag(tag), Unusable) if tagStack.headOption == Some(tag.asInstanceOf[String]) => - tagStack.pop() - sectionBuffer += new UnusableText(tokenBuffer) - tokenBuffer.clear() - stateStack.pop() - case (acceptedCloseTag(tag), Unusable) => - // we are in this state because we found an excluded open tag without a corresponding close tag. 
- // In that event we just read in everything as usable text - if(tokenBuffer.nonEmpty) { - sectionBuffer += new UsableText(tokenBuffer) - tokenBuffer.clear() - } - tokenBuffer += t - stateStack.pop() - case _ => - tokenBuffer += t - } - } - document.clearSections() - sectionBuffer foreach document.+= - document - } -} - -object WebTextSectionalizer extends XMLSectionalizer("post", Set("postdate", "poster", "quote")) -object ForumPostSectionalizer extends XMLSectionalizer("post", Set("quote")) -object NewswireSectionalizer extends XMLSectionalizer("text", Set.empty[String]) - -object TACSectionalizer extends DocumentAnnotator { - def tokenAnnotationString(token: Token) = null - - val prereqAttrs = Seq(classOf[Token], classOf[TACDocumentType]) - val postAttrs = Seq(classOf[TACSection]) - - def process(document: Document) = (document.attr[TACDocumentType] match { - case Newswire => NewswireSectionalizer - case DiscussionForum => ForumPostSectionalizer - case WebDocument => WebTextSectionalizer - }).process(document) -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/morph/BasicMorphologicalAnalyzer.scala b/src/main/scala/cc/factorie/app/nlp/morph/BasicMorphologicalAnalyzer.scala index 39d8868..183fb1c 100644 --- a/src/main/scala/cc/factorie/app/nlp/morph/BasicMorphologicalAnalyzer.scala +++ b/src/main/scala/cc/factorie/app/nlp/morph/BasicMorphologicalAnalyzer.scala @@ -16,6 +16,8 @@ import cc.factorie.app.nlp.lexicon.Lexicon import cc.factorie.util.ClasspathURL import scala.collection.mutable +import scala.io + /** A simple morphological analyzer, simply indicating if a noun is singular or plural. Obviously this supports very limited functionality. More will be added as needed. diff --git a/src/main/scala/cc/factorie/app/nlp/ner/BILOU.scala b/src/main/scala/cc/factorie/app/nlp/ner/BILOU.scala new file mode 100644 index 0000000..4a3b350 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/BILOU.scala @@ -0,0 +1,48 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.variable.CategoricalDomain + +/** BILOU span encoding (Beginning, Inside, Last, Outside, Unit) */ +trait BILOU extends SpanEncoding { + this : CategoricalDomain[String] => + def prefixes = Set("B-", "I-", "L-", "U-") + def isLicitTransition(from: String, to: String) = BILOU.licitTransitions contains from -> to + + def splitNerTag(tag:String):(String, Option[String]) = if(tag == "O") "O" -> None else { + val Array(pre, cat) = tag.split("-") + if(pre == "U") { + pre -> None + } else { + pre -> Some(cat) + } + } + + def isLicit(from: this.type#Value, to: this.type#Value) = + splitNerTag(from.category) -> splitNerTag(to.category) match { + case ((fromPre, Some(fromCat)), (toPre, Some(toCat))) => toCat == fromCat && BILOU.licitTransitions.contains(fromPre -> toPre) + case ((fromPre, _), (toPre, _)) => BILOU.licitTransitions contains fromPre -> toPre + } + +} + +object BILOU { + val licitTransitions = Set( + "O" -> "B", + "O" -> "U", + "O" -> "O", + + "B" -> "I", + "B" -> "L", + + "I" -> "I", + "I" -> "L", + + "L" -> "O", + "L" -> "B", + "L" -> "U", + + "U" -> "U", + "U" -> "B", + "U" -> "O" + ) +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerChunkAnnotator.scala b/src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerChunkAnnotator.scala new file mode 100644 index 0000000..e307443 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerChunkAnnotator.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.ner + +import 
cc.factorie.app.nlp.Section + +object BilouConllNerChunkAnnotator extends NerChunkAnnotator[ConllNerSpan, BilouConllNerTag]({() => new ConllNerSpanBuffer}, {(s:Section, start:Int, end:Int, cat:String) => new ConllNerSpan(s, start, end, cat)}) diff --git a/src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerDomain.scala b/src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerDomain.scala new file mode 100644 index 0000000..79928c0 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerDomain.scala @@ -0,0 +1,13 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.{Section, _} +import cc.factorie.variable.CategoricalDomain + +object BilouConllNerDomain extends CategoricalDomain[String] with BILOU { + this ++= encodedTags(ConllNerDomain.categories) + freeze() + def spanList(section:Section): ConllNerSpanBuffer = { + val boundaries = bilouBoundaries(section.tokens.map(_.attr[BilouConllNerTag].categoryValue)) + new ConllNerSpanBuffer ++= boundaries.map(b => new ConllNerSpan(section, b._1, b._2, b._3)) + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerTag.scala b/src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerTag.scala new file mode 100644 index 0000000..5702253 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/BilouConllNerTag.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Token + +class BilouConllNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) with Serializable { + def domain = BilouConllNerDomain +} diff --git a/src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerChunkAnnotator.scala b/src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerChunkAnnotator.scala new file mode 100644 index 0000000..838aa14 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerChunkAnnotator.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Section + +object BilouOntonotesNerChunkAnnotator extends NerChunkAnnotator[OntonotesNerSpan, BilouOntonotesNerTag]({() => new OntonotesNerSpanBuffer}, {(s:Section, start:Int, end:Int, cat:String) => new OntonotesNerSpan(s, start, end, cat)}) \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerDomain.scala b/src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerDomain.scala new file mode 100644 index 0000000..c646e7a --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerDomain.scala @@ -0,0 +1,15 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.{Section, _} +import cc.factorie.variable.CategoricalDomain + +object BilouOntonotesNerDomain extends CategoricalDomain[String] with BILOU { + this ++= encodedTags(OntonotesNerDomain.categories) + freeze() + // Convert from an intValue in this domain to an intValue in the OntonotesNerDomain + def bilouSuffixIntValue(bilouIntValue:Int): Int = if (bilouIntValue == 0) 0 else ((bilouIntValue - 1) / 4) + 1 + def spanList(section:Section): OntonotesNerSpanBuffer = { + val boundaries = bilouBoundaries(section.tokens.map(_.attr[BilouOntonotesNerTag].categoryValue)) + new OntonotesNerSpanBuffer ++= boundaries.map(b => new OntonotesNerSpan(section, b._1, b._2, b._3)) + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerTag.scala b/src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerTag.scala new file mode 100644 index 0000000..8150e81 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/BilouOntonotesNerTag.scala @@ -0,0 +1,6 @@ 
+package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Token + +class BilouOntonotesNerTag(token:Token, initialCategory:String) + extends NerTag(token, initialCategory) with Serializable { def domain = BilouOntonotesNerDomain } \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/ChainNer.scala b/src/main/scala/cc/factorie/app/nlp/ner/ChainNer.scala index 3b64e7e..d170279 100644 --- a/src/main/scala/cc/factorie/app/nlp/ner/ChainNer.scala +++ b/src/main/scala/cc/factorie/app/nlp/ner/ChainNer.scala @@ -14,66 +14,17 @@ package cc.factorie.app.nlp.ner import java.io._ -import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} - import cc.factorie._ import cc.factorie.app.chain.{ChainModel, SegmentEvaluation} -import cc.factorie.app.nlp._ +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Sentence, Token} import cc.factorie.optimize.{AdaGrad, ParameterAveraging, Trainer} import cc.factorie.util._ import cc.factorie.variable._ -import scala.reflect.{ClassTag, classTag} - -import cc.factorie.app.nlp.lemma.LowercaseLemmatizer - -/** - * NER tagger for the CoNLL 2003 corpus - * - * Training time: ~3 minutes (on blake, 30 Oct. 4:00pm) - * tokens per second: 8431.02310444517 - * docs per second: 48.24287793720109 (avg doc length = 200 tokens) - * - * CoNLL 2003 dev set (eng.testa) - * OVERALL f1=0.933593 p=0.939802 r=0.927465 (tp=5511 fp=353 fn=431 true=5942 pred=5864) acc=0.985865 (50636/51362) - * LOC f1=0.965931 p=0.967249 r=0.964616 (tp=1772 fp=60 fn=65 true=1837 pred=1832) - * MISC f1=0.876404 p=0.909091 r=0.845987 (tp=780 fp=78 fn=142 true=922 pred=858) - * ORG f1=0.892065 p=0.899848 r=0.884415 (tp=1186 fp=132 fn=155 true=1341 pred=1318) - * PER f1=0.958897 p=0.955280 r=0.962541 (tp=1773 fp=83 fn=69 true=1842 pred=1856) - * - * CoNLL 2003 test set (eng.testb) - * OVERALL f1=0.885633 p=0.888315 r=0.882967 (tp=4987 fp=627 fn=661 true=5648 pred=5614) acc=0.973253 (45193/46435) - * LOC f1=0.915375 p=0.909953 r=0.920863 (tp=1536 fp=152 fn=132 true=1668 pred=1688) - * MISC f1=0.791034 p=0.803231 r=0.779202 (tp=547 fp=134 fn=155 true=702 pred=681) - * ORG f1=0.842767 p=0.838498 r=0.847080 (tp=1407 fp=271 fn=254 true=1661 pred=1678) - * PER f1=0.940327 p=0.955329 r=0.925788 (tp=1497 fp=70 fn=120 true=1617 pred=1567) - * - */ -class ConllChainNer(implicit mp:ModelProvider[ConllChainNer], nerLexiconFeatures:NerLexiconFeatures) - extends ChainNer[BilouConllNerTag]( - BilouConllNerDomain, - (t, s) => new BilouConllNerTag(t, s), - l => l.token, - mp.provide, - nerLexiconFeatures) with Serializable { - def loadDocs(fileName: String): Seq[Document] = cc.factorie.app.nlp.load.LoadConll2003(BILOU=true).fromFilename(fileName) - - def newSpan(sec: Section, start: Int, length: Int, category: String) = new ConllNerSpan(sec, start, length, category) - - def newBuffer = new ConllNerSpanBuffer -} - -//TODO this serialized model doesn't exist yet? 
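For orientation, a minimal sketch of how the BILOU pieces introduced above fit together. It assumes only the BILOU transition table, BilouConllNerDomain, and the package-level bilouBoundaries helper already used by spanList; the commented values are expected results, not captured output:

import cc.factorie.app.nlp.ner.{BILOU, BilouConllNerDomain}

object BilouEncodingSketch {
  def main(args: Array[String]): Unit = {
    // The transition table is keyed on prefixes only: B may be followed by I or L,
    // but never directly by O, since a span must be closed by L or be a one-token U.
    println(BILOU.licitTransitions contains ("B" -> "L"))  // expected: true
    println(BILOU.licitTransitions contains ("B" -> "O"))  // expected: false

    // BilouConllNerDomain expands the four non-O CoNLL categories with every prefix.
    println(BilouConllNerDomain.categories.size)  // expected: 17 = 1 ("O") + 4 prefixes * 4 categories

    // A token-level tag sequence and the (start, length, category) spans it encodes,
    // the same triples that spanList turns into ConllNerSpans.
    val tags = Seq("O", "B-PER", "L-PER", "O", "U-LOC")
    println(cc.factorie.app.nlp.bilouBoundaries(tags))  // expected: spans (1,2,PER) and (4,1,LOC)
  }
}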
-object ConllChainNer extends ConllChainNer()(ModelProvider.classpath(), StaticLexiconFeatures()) with Serializable +import scala.reflect.ClassTag -class OntonotesChainNer()(implicit mp:ModelProvider[OntonotesChainNer], nerLexiconFeatures:NerLexiconFeatures) - extends ChainNer[BilouOntonotesNerTag](BilouOntonotesNerDomain, (t, s) => new BilouOntonotesNerTag(t, s), l => l.token, mp.provide, nerLexiconFeatures) { - def newBuffer = new OntonotesNerSpanBuffer() - def newSpan(sec: Section, start: Int, length: Int, category: String) = new OntonotesNerSpan(sec, start, length, category) -} -object OntonotesChainNer extends OntonotesChainNer()(ModelProvider.classpath(), StaticLexiconFeatures()) /** * A base class for finite-state named entity recognizers @@ -310,7 +261,7 @@ abstract class ChainNer[L<:NerTag](val labelDomain: CategoricalDomain[String] wi segmentEvaluation.f1 } } - +/* class ChainNerOpts extends cc.factorie.util.CmdOptions with SharedNLPCmdOptions with ModelProviderCmdOptions with DefaultCmdOptions { val saveModel = new CmdOption("save-model", "CoNLLChainNer.factorie", "FILE", "Filename for the model (saving a trained model or reading a running model.") val serialize = new CmdOption("serialize", true, "BOOLEAN", "Whether to serialize at all") @@ -323,17 +274,17 @@ class ChainNerOpts extends cc.factorie.util.CmdOptions with SharedNLPCmdOptions val delta = new CmdOption("delta", 0.066, "DOUBLE", "learning delta") val modelFile = new CmdOption("model-file", "", "STRING", "Filename of the serialized model that you want to load.") val useTagger = new CmdOption("use-tagger", "", "STRING", "Which tagger? (remove me later)") - val lexicons = new LexiconsProviderCmdOption("lexicons") + val lexicon = new LexiconsProviderCmdOption("lexicon") val lang = new CmdOption("language", "en", "STRING", "Lexicons language.") } - +*/ /* object ConllChainNerTrainer extends cc.factorie.util.HyperparameterMain { def evaluateParameters(args:Array[String]): Double = { val opts = new ChainNerOpts implicit val random = new scala.util.Random(0) opts.parse(args) - val ner = new ConllChainNer()(ModelProvider.empty, new StaticLexiconFeatures(new StaticLexicons()(opts.lexicons.value), opts.lang.value)) + val ner = new ConllChainNer()(ModelProvider.empty, new StaticLexiconFeatures(new StaticLexicons()(opts.lexicon.value), opts.lang.value)) if (opts.brownClusFile.wasInvoked) { println(s"Reading brown cluster file: ${opts.brownClusFile.value}") for (line <- scala.io.Source.fromFile(opts.brownClusFile.value).getLines()) { diff --git a/src/main/scala/cc/factorie/app/nlp/ner/ConllChainNer.scala b/src/main/scala/cc/factorie/app/nlp/ner/ConllChainNer.scala new file mode 100644 index 0000000..f3ce49b --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/ConllChainNer.scala @@ -0,0 +1,47 @@ +package cc.factorie.app.nlp.ner + +import java.io.Serializable + +import cc.factorie.app.nlp.load.LoadConll2003 +import cc.factorie.app.nlp.{Document, Section} +import cc.factorie.util.ModelProvider + + +/** + * NER tagger for the CoNLL 2003 corpus + * + * Training time: ~3 minutes (on blake, 30 Oct. 
4:00pm) + * tokens per second: 8431.02310444517 + * docs per second: 48.24287793720109 (avg doc length = 200 tokens) + * + * CoNLL 2003 dev set (eng.testa) + * OVERALL f1=0.933593 p=0.939802 r=0.927465 (tp=5511 fp=353 fn=431 true=5942 pred=5864) acc=0.985865 (50636/51362) + * LOC f1=0.965931 p=0.967249 r=0.964616 (tp=1772 fp=60 fn=65 true=1837 pred=1832) + * MISC f1=0.876404 p=0.909091 r=0.845987 (tp=780 fp=78 fn=142 true=922 pred=858) + * ORG f1=0.892065 p=0.899848 r=0.884415 (tp=1186 fp=132 fn=155 true=1341 pred=1318) + * PER f1=0.958897 p=0.955280 r=0.962541 (tp=1773 fp=83 fn=69 true=1842 pred=1856) + * + * CoNLL 2003 test set (eng.testb) + * OVERALL f1=0.885633 p=0.888315 r=0.882967 (tp=4987 fp=627 fn=661 true=5648 pred=5614) acc=0.973253 (45193/46435) + * LOC f1=0.915375 p=0.909953 r=0.920863 (tp=1536 fp=152 fn=132 true=1668 pred=1688) + * MISC f1=0.791034 p=0.803231 r=0.779202 (tp=547 fp=134 fn=155 true=702 pred=681) + * ORG f1=0.842767 p=0.838498 r=0.847080 (tp=1407 fp=271 fn=254 true=1661 pred=1678) + * PER f1=0.940327 p=0.955329 r=0.925788 (tp=1497 fp=70 fn=120 true=1617 pred=1567) + * + */ +class ConllChainNer(implicit mp:ModelProvider[ConllChainNer], nerLexiconFeatures:NerLexiconFeatures) + extends ChainNer[BilouConllNerTag]( + BilouConllNerDomain, + (t, s) => new BilouConllNerTag(t, s), + l => l.token, + mp.provide, + nerLexiconFeatures) with Serializable { + def loadDocs(fileName: String): Seq[Document] = LoadConll2003(BILOU=true).fromFilename(fileName) + + def newSpan(sec: Section, start: Int, length: Int, category: String) = new ConllNerSpan(sec, start, length, category) + + def newBuffer = new ConllNerSpanBuffer +} + +//TODO this serialized model doesn't exist yet? +object ConllChainNer extends ConllChainNer()(ModelProvider.classpath(), StaticLexiconFeatures()) with Serializable diff --git a/src/main/scala/cc/factorie/app/nlp/ner/ConllNerDomain.scala b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerDomain.scala new file mode 100644 index 0000000..de41296 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerDomain.scala @@ -0,0 +1,8 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.variable.EnumDomain + +object ConllNerDomain extends EnumDomain { + val O, PER, ORG, LOC, MISC = Value + freeze() +} diff --git a/src/main/scala/cc/factorie/app/nlp/ner/ConllNerLabel.scala b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerLabel.scala new file mode 100644 index 0000000..4e0f181 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerLabel.scala @@ -0,0 +1,4 @@ +package cc.factorie.app.nlp.ner + + +//class ConllNerLabel(val token:Token, targetValue:String) extends NerLabel(targetValue) { def domain = ConllNerDomain } diff --git a/src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpan.scala b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpan.scala new file mode 100644 index 0000000..4cceba7 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpan.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Section + +class ConllNerSpan(section:Section, start:Int, length:Int, category:String) extends NerSpan(section, start, length) with Serializable { val label = new ConllNerSpanLabel(this, category) } diff --git a/src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpanBuffer.scala b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpanBuffer.scala new file mode 100644 index 0000000..8243161 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpanBuffer.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.ner 
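A hedged usage sketch for the ConllChainNer defined above. It assumes a serialized model really is available on the classpath (the TODO above notes it may not exist yet) and builds a small pre-tokenized document by hand, mirroring the Token/Sentence construction pattern used by the loaders in this patch; in practice a tokenizer annotator would supply the tokens:

import cc.factorie.app.nlp.{Document, Sentence, Token}
import cc.factorie.app.nlp.ner.{BilouConllNerDomain, ConllChainNer}

object ConllChainNerSketch {
  def main(args: Array[String]): Unit = {
    // Hand-built, pre-tokenized document.
    val doc = new Document().setName("example")
    val sentence = new Sentence(doc)
    for (word <- Seq("James", "Smith", "visited", "Paris", ".")) {
      new Token(sentence, word)
      doc.appendString(" ")
    }
    // ConllChainNer is a DocumentAnnotator: process() attaches a BilouConllNerTag to each token.
    ConllChainNer.process(doc)
    // Collapse the token-level BILOU tags into typed spans, e.g. NerSpan(2,PER:James Smith).
    BilouConllNerDomain.spanList(doc.asSection).foreach(println)
  }
}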
+ +/** + * Created by andrew@andrewresearch.net on 28/10/17. + */ + +class ConllNerSpanBuffer extends NerSpanBuffer[ConllNerSpan] with Serializable diff --git a/src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpanLabel.scala b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpanLabel.scala new file mode 100644 index 0000000..4563815 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerSpanLabel.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.TokenSpan + +class ConllNerSpanLabel(span:TokenSpan, initialCategory:String) extends NerSpanLabel(span, initialCategory) with Serializable { def domain = ConllNerDomain } + diff --git a/src/main/scala/cc/factorie/app/nlp/ner/ConllNerTag.scala b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerTag.scala new file mode 100644 index 0000000..44fc50a --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/ConllNerTag.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Token + +class ConllNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) { def domain = ConllNerDomain } \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/ConllStackedChainNer.scala b/src/main/scala/cc/factorie/app/nlp/ner/ConllStackedChainNer.scala new file mode 100644 index 0000000..7bf0921 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/ConllStackedChainNer.scala @@ -0,0 +1,18 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.embeddings.SkipGramEmbedding +import cc.factorie.util.ModelProvider + +class ConllStackedChainNer(embeddingMap: SkipGramEmbedding, + embeddingDim: Int, + scale: Double, + useOffsetEmbedding: Boolean)(implicit mp:ModelProvider[ConllStackedChainNer], nerLexiconFeatures:NerLexiconFeatures) + extends StackedChainNer[BilouConllNerTag]( + BilouConllNerDomain, + (t, s) => new BilouConllNerTag(t, s), + l => l.token, + embeddingMap, + embeddingDim, + scale, + useOffsetEmbedding, + mp.provide, nerLexiconFeatures) diff --git a/src/main/scala/cc/factorie/app/nlp/ner/LabeledBilouNerTag.scala b/src/main/scala/cc/factorie/app/nlp/ner/LabeledBilouNerTag.scala new file mode 100644 index 0000000..5ebfb94 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/LabeledBilouNerTag.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Token +import cc.factorie.variable.CategoricalLabeling + +class LabeledBilouConllNerTag(token:Token, initialCategory:String) + extends BilouConllNerTag(token, initialCategory) with CategoricalLabeling[String] with Serializable diff --git a/src/main/scala/cc/factorie/app/nlp/ner/LabeledBilouOntonotesNerTag.scala b/src/main/scala/cc/factorie/app/nlp/ner/LabeledBilouOntonotesNerTag.scala new file mode 100644 index 0000000..36b548d --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/LabeledBilouOntonotesNerTag.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Token +import cc.factorie.variable.CategoricalLabeling + +class LabeledBilouOntonotesNerTag(token:Token, initialCategory:String) + extends BilouOntonotesNerTag(token, initialCategory) with CategoricalLabeling[String] with Serializable diff --git a/src/main/scala/cc/factorie/app/nlp/ner/LabeledConllNerTag.scala b/src/main/scala/cc/factorie/app/nlp/ner/LabeledConllNerTag.scala new file mode 100644 index 0000000..3ec335e --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/LabeledConllNerTag.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.ner + +import 
cc.factorie.app.nlp.Token +import cc.factorie.variable.CategoricalLabeling + +class LabeledConllNerTag(token:Token, initialCategory:String) extends ConllNerTag(token, initialCategory) with CategoricalLabeling[String] diff --git a/src/main/scala/cc/factorie/app/nlp/ner/LabeledOntonotesNerTag.scala b/src/main/scala/cc/factorie/app/nlp/ner/LabeledOntonotesNerTag.scala new file mode 100644 index 0000000..871a1fd --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/LabeledOntonotesNerTag.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Token +import cc.factorie.variable.CategoricalLabeling + +class LabeledOntonotesNerTag(token:Token, initialCategory:String) + extends OntonotesNerTag(token, initialCategory) with CategoricalLabeling[String] \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/NERChunkAnnotator.scala b/src/main/scala/cc/factorie/app/nlp/ner/NERChunkAnnotator.scala index de3e575..7ca4590 100644 --- a/src/main/scala/cc/factorie/app/nlp/ner/NERChunkAnnotator.scala +++ b/src/main/scala/cc/factorie/app/nlp/ner/NERChunkAnnotator.scala @@ -1,30 +1,11 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ package cc.factorie.app.nlp.ner -import cc.factorie.app.nlp.{Section, Token, Document, DocumentAnnotator} -import scala.reflect.{ClassTag, classTag} -import scala.collection.mutable +import java.util.logging.{Level, Logger} -import java.util.logging.{Logger, Level} +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Section, Token} -/** - * @author John Sullivan - */ -object BilouConllNerChunkAnnotator extends NerChunkAnnotator[ConllNerSpan, BilouConllNerTag]({() => new ConllNerSpanBuffer}, {(s:Section, start:Int, end:Int, cat:String) => new ConllNerSpan(s, start, end, cat)}) -object BilouOntonotesNerChunkAnnotator extends NerChunkAnnotator[OntonotesNerSpan, BilouOntonotesNerTag]({() => new OntonotesNerSpanBuffer}, {(s:Section, start:Int, end:Int, cat:String) => new OntonotesNerSpan(s, start, end, cat)}) -object BioConllNerChunkAnnotator extends NerChunkAnnotator[ConllNerSpan, BioConllNerTag]({() => new ConllNerSpanBuffer}, {(s:Section, start:Int, end:Int, cat:String) => new ConllNerSpan(s, start, end, cat)}) -object BioOntonotesNerChunkAnnotator extends NerChunkAnnotator[OntonotesNerSpan, BioOntonotesNerTag]({() => new OntonotesNerSpanBuffer}, {(s:Section, start:Int, end:Int, cat:String) => new OntonotesNerSpan(s, start, end, cat)}) +import scala.collection.mutable +import scala.reflect.{ClassTag, classTag} /** Takes documents that are already annotated with token-level NerTags of type Tag and annotates them with NerSpans * of type Span */ @@ -85,5 +66,5 @@ class NerChunkAnnotator[Span <: NerSpan : ClassTag, Tag <: NerTag : ClassTag](ne } object NerChunkAnnotator { - private val logger : Logger = Logger.getLogger(getClass.getName) + private val logger : Logger = Logger.getLogger(getClass.getName) } diff --git a/src/main/scala/cc/factorie/app/nlp/ner/NerSpan.scala b/src/main/scala/cc/factorie/app/nlp/ner/NerSpan.scala new file mode 100644 index 0000000..02bcd2a --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/NerSpan.scala @@ -0,0 +1,14 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.{Section, TokenSpan} + +/** + * Created by andrew@andrewresearch.net on 28/10/17. + */ + +/** A TokenSpan covering a named entity. Its entity type is indicated by its "label" member. + @author Andrew McCallum */ +abstract class NerSpan(section:Section, start:Int, length:Int) extends TokenSpan(section, start, length) { + def label: NerSpanLabel + override def toString = "NerSpan("+length+","+label.categoryValue+":"+this.string+")" +} diff --git a/src/main/scala/cc/factorie/app/nlp/ner/NerSpanBuffer.scala b/src/main/scala/cc/factorie/app/nlp/ner/NerSpanBuffer.scala new file mode 100644 index 0000000..ec11a5e --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/NerSpanBuffer.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.TokenSpanBuffer + +trait NerSpanBuffer[Tag <: NerSpan] extends TokenSpanBuffer[Tag] + diff --git a/src/main/scala/cc/factorie/app/nlp/ner/NerSpanLabel.scala b/src/main/scala/cc/factorie/app/nlp/ner/NerSpanLabel.scala new file mode 100644 index 0000000..ddadeb1 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/NerSpanLabel.scala @@ -0,0 +1,10 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.TokenSpan +import cc.factorie.variable.CategoricalVariable + +/** A categorical variable holding the named entity type of a TokenSpan. + * More specific subclasses have a domain, such as ConllNerDomain. 
+ * + * @author Andrew McCallum */ +abstract class NerSpanLabel(val span:TokenSpan, initialCategory:String) extends CategoricalVariable(initialCategory) diff --git a/src/main/scala/cc/factorie/app/nlp/ner/NerTag.scala b/src/main/scala/cc/factorie/app/nlp/ner/NerTag.scala index e5efc4d..bb29f1f 100644 --- a/src/main/scala/cc/factorie/app/nlp/ner/NerTag.scala +++ b/src/main/scala/cc/factorie/app/nlp/ner/NerTag.scala @@ -1,327 +1,17 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp.ner -import cc.factorie.app.nlp._ -import cc.factorie.la.{SparseBinaryTensor2, Tensor2} -import cc.factorie.variable._ -import scala.collection.JavaConverters._ +import cc.factorie.app.nlp.Token +import cc.factorie.variable.CategoricalVariable // A "Tag" is a categorical label associated with a token. - /** An abstract class for a variable holding the part-of-speech tag of a Token. +/** An abstract class for a variable holding the part-of-speech tag of a Token. More specific subclasses have a domain, such as BilouConllNerDomain. @author Andrew McCallum */ abstract class NerTag(val token:Token, initialCategory:String) extends CategoricalVariable(initialCategory) { - /** Return "PER" instead of "I-PER". */ - def baseCategoryValue: String = if (categoryValue.length > 1 && categoryValue(1) == '-') categoryValue.substring(2) else categoryValue - - def isEmpty = categoryValue == "O" // this should always be correct, but it might not be - def spanPrefix = categoryValue.split("-").apply(0) - } - -trait NerSpanBuffer[Tag <: NerSpan] extends TokenSpanBuffer[Tag] - -/** A categorical variable holding the named entity type of a TokenSpan. - More specific subclasses have a domain, such as ConllNerDomain. - @author Andrew McCallum */ -abstract class NerSpanLabel(val span:TokenSpan, initialCategory:String) extends CategoricalVariable(initialCategory) -/** A TokenSpan covering a named entity. Its entity type is indicated by its "label" member. - @author Andrew McCallum */ -abstract class NerSpan(section:Section, start:Int, length:Int) extends TokenSpan(section, start, length) { - def label: NerSpanLabel - override def toString = "NerSpan("+length+","+label.categoryValue+":"+this.string+")" -} -// Note: There are currently no labeled counterparts to these SpanLabels. 
- -/** Base trait for label span encodings like BILOU and BIO - * @author Kate Silverstein - */ -trait SpanEncoding { - this: CategoricalDomain[String] => - def prefixes: Set[String] - def encodedTags(baseTags: Seq[String]): Seq[String] = Seq("O") ++ baseTags.filter(_ != "O").flatMap(t => prefixes.map(_ + t)) - def suffixIntVal(i: Int): Int = if (i == 0) 0 else ((i - 1)/prefixes.size)+1 - def isLicitTransition(from:String, to:String):Boolean - - def isLicit(from:this.type#Value, to:this.type#Value):Boolean - - final def permittedMask:Set[(Int, Int)] = - (for(t1 <- this._indices.values().asScala; // todo there has to be a better way to get this - t2 <- this._indices.values().asScala - if isLicit(t1, t2)) yield { - //println(s"${t1.category} -> ${t2.category}") - t1.intValue -> t2.intValue }).toSet -} -object BILOU { - val licitTransitions = Set( - "O" -> "B", - "O" -> "U", - "O" -> "O", - - "B" -> "I", - "B" -> "L", - - "I" -> "I", - "I" -> "L", - - "L" -> "O", - "L" -> "B", - "L" -> "U", - - "U" -> "U", - "U" -> "B", - "U" -> "O" - ) -} - -object BIO { - val licitTransitions = Set( - "B" -> "I", - "B" -> "O", - "B" -> "B", - - "O" -> "B", - "O" -> "O", - - "I" -> "I", - "I" -> "O", - "I" -> "B" - ) -} - -/** BILOU span encoding (Beginning, Inside, Last, Outside, Unit) */ -trait BILOU extends SpanEncoding { - this : CategoricalDomain[String] => - def prefixes = Set("B-", "I-", "L-", "U-") - def isLicitTransition(from: String, to: String) = BILOU.licitTransitions contains from -> to - - def splitNerTag(tag:String):(String, Option[String]) = if(tag == "O") "O" -> None else { - val Array(pre, cat) = tag.split("-") - if(pre == "U") { - pre -> None - } else { - pre -> Some(cat) - } - } - - def isLicit(from: this.type#Value, to: this.type#Value) = - splitNerTag(from.category) -> splitNerTag(to.category) match { - case ((fromPre, Some(fromCat)), (toPre, Some(toCat))) => toCat == fromCat && BILOU.licitTransitions.contains(fromPre -> toPre) - case ((fromPre, _), (toPre, _)) => BILOU.licitTransitions contains fromPre -> toPre - } - -} -/** BIO span encoding (Beginning, Inside, Outside) */ -trait BIO extends SpanEncoding { - this : CategoricalDomain[String] => - def prefixes = Set("B-", "I-") - def isLicitTransition(from: String, to: String) = BILOU.licitTransitions contains from -> to - def splitNerTag(tag:String):(String, Option[String]) = if(tag == "O") "O" -> None else { - val Array(pre, cat) = tag.split("-") - pre -> Some(cat) - } - - def isLicit(from: this.type#Value, to: this.type#Value) = - splitNerTag(from.category) -> splitNerTag(to.category) match { - case ((fromPre, Some(fromCat)), (toPre, Some(toCat))) => toCat == fromCat && BIO.licitTransitions.contains(fromPre -> toPre) - case ((fromPre, _), (toPre, _)) => BIO.licitTransitions contains fromPre -> toPre - } -} - -object ConllNerDomain extends EnumDomain { - val O, PER, ORG, LOC, MISC = Value - freeze() -} -class ConllNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) { def domain = ConllNerDomain } -class LabeledConllNerTag(token:Token, initialCategory:String) extends ConllNerTag(token, initialCategory) with CategoricalLabeling[String] - -class ConllNerSpanLabel(span:TokenSpan, initialCategory:String) extends NerSpanLabel(span, initialCategory) with Serializable { def domain = ConllNerDomain } -class ConllNerSpan(section:Section, start:Int, length:Int, category:String) extends NerSpan(section, start, length) with Serializable { val label = new ConllNerSpanLabel(this, category) } -class ConllNerSpanBuffer 
extends NerSpanBuffer[ConllNerSpan] with Serializable -//class ConllNerLabel(val token:Token, targetValue:String) extends NerLabel(targetValue) { def domain = ConllNerDomain } - - -object BioConllNerDomain extends CategoricalDomain[String] with BIO { - this ++= encodedTags(ConllNerDomain.categories) - freeze() - //val B_PER = index("B-PER") - //val I_PER = index("I-PER") - // TODO add more of these index vals - def spanList(section:Section): ConllNerSpanBuffer = { - val boundaries = iobBoundaries(section.tokens.map(_.attr[BioConllNerTag].categoryValue)) - new ConllNerSpanBuffer ++= boundaries.map(b => new ConllNerSpan(section, b._1, b._2, b._3)) - } -} -class BioConllNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) with Serializable { def domain = BioConllNerDomain } -class LabeledBioConllNerTag(token:Token, initialCategory:String) extends BioConllNerTag(token, initialCategory) with CategoricalLabeling[String] with Serializable -//class BioConllNerLabel(val token:Token, targetValue:String) extends NerLabel(targetValue) { def domain = BioConllNerDomain } - - -object BilouConllNerDomain extends CategoricalDomain[String] with BILOU { - this ++= encodedTags(ConllNerDomain.categories) - freeze() - def spanList(section:Section): ConllNerSpanBuffer = { - val boundaries = bilouBoundaries(section.tokens.map(_.attr[BilouConllNerTag].categoryValue)) - new ConllNerSpanBuffer ++= boundaries.map(b => new ConllNerSpan(section, b._1, b._2, b._3)) - } -} -class BilouConllNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) with Serializable { - def domain = BilouConllNerDomain -} -class LabeledBilouConllNerTag(token:Token, initialCategory:String) - extends BilouConllNerTag(token, initialCategory) with CategoricalLabeling[String] with Serializable -//class BilouConllNerLabel(val token:Token, targetValue:String) extends NerLabel(targetValue) { def domain = BilouConllNerDomain } - - -object OntonotesNerDomain extends EnumDomain { - val O, - CARDINAL, - DATE, - EVENT, - FAC, - GPE, - LANGUAGE, - LAW, - LOC, - MONEY, - NORP, - ORDINAL, - ORG, - PERCENT, - PERSON, - PRODUCT, - QUANTITY, - TIME, - WORK_OF_ART = Value - freeze() -} - -/** Entity types used in coreference. 
- @author Andrew McCallum */ -object OntonotesEntityTypeDomain extends EnumDomain { - val O, - CARDINAL, - DATE, - EVENT, - FAC, - GPE, - LANGUAGE, - LAW, - LOC, - MONEY, - NORP, - ORDINAL, - ORG, - PERCENT, - PERSON, - PRODUCT, - QUANTITY, - TIME, - WORK_OF_ART, - MISC = Value - freeze() -} -// OntonotesEntityType is defined in cc.factorie.app.nlp.phrase - -class OntonotesNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) { - def domain = OntonotesNerDomain -} -class LabeledOntonotesNerTag(token:Token, initialCategory:String) - extends OntonotesNerTag(token, initialCategory) with CategoricalLabeling[String] - -class OntonotesNerSpanLabel(span:TokenSpan, initialCategory:String) extends NerSpanLabel(span, initialCategory) with Serializable { def domain = OntonotesNerDomain } -class OntonotesNerSpan(section:Section, start:Int, length:Int, category:String) extends NerSpan(section, start, length) with Serializable { val label = new OntonotesNerSpanLabel(this, category) } -class OntonotesNerSpanBuffer(spans:Iterable[OntonotesNerSpan]) extends NerSpanBuffer[OntonotesNerSpan] with Serializable { - def this() = this(Iterable.empty[OntonotesNerSpan]) -} - - -object BioOntonotesNerDomain extends CategoricalDomain[String] with BIO { - this ++= encodedTags(OntonotesNerDomain.categories) - freeze() -} -class BioOntonotesNerTag(token:Token, initialCategory:String) - extends NerTag(token, initialCategory) with Serializable { def domain = BioOntonotesNerDomain } -class LabeledBioOntonotesNerTag(token:Token, initialCategory:String) - extends BioOntonotesNerTag(token, initialCategory) with CategoricalLabeling[String] with Serializable -class IobOntonotesNerTag(token:Token, initialCategory:String) - extends NerTag(token, initialCategory) with Serializable { def domain = BioOntonotesNerDomain } -class LabeledIobOntonotesNerTag(token:Token, initialCategory:String) - extends IobOntonotesNerTag(token, initialCategory) with CategoricalLabeling[String] with Serializable -//class BioOntonotesNerLabel(val token:Token, targetValue:String) extends NerLabel(targetValue) { def domain = BioOntonotesNerDomain } - -object BilouOntonotesNerDomain extends CategoricalDomain[String] with BILOU { - this ++= encodedTags(OntonotesNerDomain.categories) - freeze() - // Convert from an intValue in this domain to an intValue in the OntonotesNerDomain - def bilouSuffixIntValue(bilouIntValue:Int): Int = if (bilouIntValue == 0) 0 else ((bilouIntValue - 1) / 4) + 1 - def spanList(section:Section): OntonotesNerSpanBuffer = { - val boundaries = bilouBoundaries(section.tokens.map(_.attr[BilouOntonotesNerTag].categoryValue)) - new OntonotesNerSpanBuffer ++= boundaries.map(b => new OntonotesNerSpan(section, b._1, b._2, b._3)) - } -} -class BilouOntonotesNerTag(token:Token, initialCategory:String) - extends NerTag(token, initialCategory) with Serializable { def domain = BilouOntonotesNerDomain } -class LabeledBilouOntonotesNerTag(token:Token, initialCategory:String) - extends BilouOntonotesNerTag(token, initialCategory) with CategoricalLabeling[String] with Serializable - -object GermevalNerDomain extends CategoricalDomain[String] { - this ++= Vector( - "O", - "OTH", "OTHpart", "OTHderiv", - "ORG", "ORGpart", "ORGderiv", - "LOC", "LOCpart", "LOCderiv", - "PER", "PERpart", "PERderiv" - ) - freeze() -} -class GermevalNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) { def domain = GermevalNerDomain } -class LabeledGermevalNerTag(token:Token, initialCategory:String) extends 
GermevalNerTag(token, initialCategory) with CategoricalLabeling[String] - -class GermevalNerSpanLabel(span:TokenSpan, initialCategory:String) extends NerSpanLabel(span, initialCategory) { def domain = GermevalNerDomain } -class GermevalNerSpan(section:Section, start:Int, length:Int, category:String) extends NerSpan(section, start, length) { val label = new GermevalNerSpanLabel(this, category) } -class GermevalNerSpanBuffer extends NerSpanBuffer[GermevalNerSpan] - - -object BioGermevalNerDomain extends CategoricalDomain[String] with BIO { - this ++= encodedTags(GermevalNerDomain.categories) - freeze() -} - -// tags for both levels of NER annotation -class Lvl1BioGermevalNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) { def domain = BioGermevalNerDomain } -class LabeledLvl1BioGermevalNerTag(token:Token, initialCategory:String) extends Lvl1BioGermevalNerTag(token, initialCategory) with CategoricalLabeling[String] -class Lvl2BioGermevalNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) { def domain = BioGermevalNerDomain } -class LabeledLvl2BioGermevalNerTag(token:Token, initialCategory:String) extends Lvl2BioGermevalNerTag(token, initialCategory) with CategoricalLabeling[String] - -object BilouGermevalNerDomain extends CategoricalDomain[String] with BILOU { - this ++= encodedTags(GermevalNerDomain.categories) - freeze() - def lvl1SpanList(section:Section): GermevalNerSpanBuffer = { - val boundaries = bilouBoundaries(section.tokens.map(_.attr[Lvl1BilouGermevalNerTag].categoryValue)) - new GermevalNerSpanBuffer ++= boundaries.map(b => new GermevalNerSpan(section, b._1, b._2, b._3)) - } - def lvl2SpanList(section:Section): GermevalNerSpanBuffer = { - val boundaries = bilouBoundaries(section.tokens.map(_.attr[Lvl2BilouGermevalNerTag].categoryValue)) - new GermevalNerSpanBuffer ++= boundaries.map(b => new GermevalNerSpan(section, b._1, b._2, b._3)) - } -} - -// tags for both levels of NER annotation -class Lvl1BilouGermevalNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) { def domain = BilouGermevalNerDomain } -class LabeledLvl1BilouGermevalNerTag(token:Token, initialCategory:String) extends Lvl1BilouGermevalNerTag(token, initialCategory) with CategoricalLabeling[String] -class Lvl2BilouGermevalNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) { def domain = BilouGermevalNerDomain } -class LabeledLvl2BilouGermevalNerTag(token:Token, initialCategory:String) extends Lvl2BilouGermevalNerTag(token, initialCategory) with CategoricalLabeling[String] + /** Return "PER" instead of "I-PER". 
*/ + def baseCategoryValue: String = if (categoryValue.length > 1 && categoryValue(1) == '-') categoryValue.substring(2) else categoryValue + def isEmpty = categoryValue == "O" // this should always be correct, but it might not be + def spanPrefix = categoryValue.split("-").apply(0) +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/NoEmbeddingsOntonotesStackedChainNer.scala b/src/main/scala/cc/factorie/app/nlp/ner/NoEmbeddingsOntonotesStackedChainNer.scala new file mode 100644 index 0000000..6ae3026 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/NoEmbeddingsOntonotesStackedChainNer.scala @@ -0,0 +1,8 @@ +package cc.factorie.app.nlp.ner + +import java.io.Serializable + +import cc.factorie.util.ModelProvider + +class NoEmbeddingsOntonotesStackedChainNer()(implicit mp:ModelProvider[NoEmbeddingsOntonotesStackedChainNer], nerLexiconFeatures: NerLexiconFeatures) extends OntonotesStackedChainNer(null, 0, 0.0, false)(mp, nerLexiconFeatures) with Serializable +object NoEmbeddingsOntonotesStackedChainNer extends NoEmbeddingsOntonotesStackedChainNer()(ModelProvider.classpath(), StaticLexiconFeatures()) with Serializable \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/NoEmbeddingsStackedChainNer.scala b/src/main/scala/cc/factorie/app/nlp/ner/NoEmbeddingsStackedChainNer.scala new file mode 100644 index 0000000..2b39955 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/NoEmbeddingsStackedChainNer.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.ner + +import java.io.Serializable + +import cc.factorie.util.ModelProvider + +class NoEmbeddingsConllStackedChainNer()(implicit mp:ModelProvider[NoEmbeddingsConllStackedChainNer], nerLexiconFeatures:NerLexiconFeatures) extends ConllStackedChainNer(null, 0, 0.0, false)(mp, nerLexiconFeatures) with Serializable +object NoEmbeddingsConllStackedChainNer extends NoEmbeddingsConllStackedChainNer()(ModelProvider.classpath(), StaticLexiconFeatures()) with Serializable + diff --git a/src/main/scala/cc/factorie/app/nlp/ner/OntonotesChainNer.scala b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesChainNer.scala new file mode 100644 index 0000000..0db9a76 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesChainNer.scala @@ -0,0 +1,11 @@ +package cc.factorie.app.nlp.ner +/* +class OntonotesChainNer()(implicit mp:ModelProvider[OntonotesChainNer], nerLexiconFeatures:NerLexiconFeatures) + extends ChainNer[BilouOntonotesNerTag](BilouOntonotesNerDomain, (t, s) => new BilouOntonotesNerTag(t, s), l => l.token, mp.provide, nerLexiconFeatures) { + def newBuffer = new OntonotesNerSpanBuffer() + + def newSpan(sec: Section, start: Int, length: Int, category: String) = new OntonotesNerSpan(sec, start, length, category) +} + +object OntonotesChainNer extends OntonotesChainNer()(ModelProvider.classpath(), StaticLexiconFeatures()) +*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/OntonotesEntityTypeDomain.scala b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesEntityTypeDomain.scala new file mode 100644 index 0000000..f38d5c2 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesEntityTypeDomain.scala @@ -0,0 +1,30 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.variable.EnumDomain + +/** Entity types used in coreference. 
+ * + * @author Andrew McCallum */ +object OntonotesEntityTypeDomain extends EnumDomain { + val O, + CARDINAL, + DATE, + EVENT, + FAC, + GPE, + LANGUAGE, + LAW, + LOC, + MONEY, + NORP, + ORDINAL, + ORG, + PERCENT, + PERSON, + PRODUCT, + QUANTITY, + TIME, + WORK_OF_ART, + MISC = Value + freeze() +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerDomain.scala b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerDomain.scala new file mode 100644 index 0000000..2acc580 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerDomain.scala @@ -0,0 +1,26 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.variable.EnumDomain + +object OntonotesNerDomain extends EnumDomain { + val O, + CARDINAL, + DATE, + EVENT, + FAC, + GPE, + LANGUAGE, + LAW, + LOC, + MONEY, + NORP, + ORDINAL, + ORG, + PERCENT, + PERSON, + PRODUCT, + QUANTITY, + TIME, + WORK_OF_ART = Value + freeze() +} diff --git a/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpan.scala b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpan.scala new file mode 100644 index 0000000..610e7b4 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpan.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Section + +class OntonotesNerSpan(section:Section, start:Int, length:Int, category:String) extends NerSpan(section, start, length) with Serializable { val label = new OntonotesNerSpanLabel(this, category) } diff --git a/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpanBuffer.scala b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpanBuffer.scala new file mode 100644 index 0000000..24ed25b --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpanBuffer.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.ner + +class OntonotesNerSpanBuffer(spans:Iterable[OntonotesNerSpan]) extends NerSpanBuffer[OntonotesNerSpan] with Serializable { + def this() = this(Iterable.empty[OntonotesNerSpan]) +} diff --git a/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpanLabel.scala b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpanLabel.scala new file mode 100644 index 0000000..27b5601 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerSpanLabel.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.TokenSpan + +class OntonotesNerSpanLabel(span:TokenSpan, initialCategory:String) extends NerSpanLabel(span, initialCategory) with Serializable { def domain = OntonotesNerDomain } \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerTag.scala b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerTag.scala new file mode 100644 index 0000000..47c87ed --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesNerTag.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Token + +class OntonotesNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) { + def domain = OntonotesNerDomain +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/OntonotesStackedChainNer.scala b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesStackedChainNer.scala new file mode 100644 index 0000000..322e368 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/OntonotesStackedChainNer.scala @@ -0,0 +1,18 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.embeddings.SkipGramEmbedding +import cc.factorie.util.ModelProvider + +class 
OntonotesStackedChainNer(embeddingMap: SkipGramEmbedding, + embeddingDim: Int, + scale: Double, + useOffsetEmbedding: Boolean)(implicit mp:ModelProvider[OntonotesStackedChainNer], nerLexiconFeatures:NerLexiconFeatures) + extends StackedChainNer[BilouOntonotesNerTag]( + BilouOntonotesNerDomain, + (t, s) => new BilouOntonotesNerTag(t, s), + l => l.token, + embeddingMap, + embeddingDim, + scale, + useOffsetEmbedding, + mp.provide, nerLexiconFeatures) diff --git a/src/main/scala/cc/factorie/app/nlp/ner/SpanEncoding.scala b/src/main/scala/cc/factorie/app/nlp/ner/SpanEncoding.scala new file mode 100644 index 0000000..72a06a0 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/SpanEncoding.scala @@ -0,0 +1,26 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.variable.CategoricalDomain + +import scala.collection.JavaConverters._ + +/** Base trait for label span encodings like BILOU and BIO + * + * @author Kate Silverstein + */ +trait SpanEncoding { + this: CategoricalDomain[String] => + def prefixes: Set[String] + def encodedTags(baseTags: Seq[String]): Seq[String] = Seq("O") ++ baseTags.filter(_ != "O").flatMap(t => prefixes.map(_ + t)) + def suffixIntVal(i: Int): Int = if (i == 0) 0 else ((i - 1)/prefixes.size)+1 + def isLicitTransition(from:String, to:String):Boolean + + def isLicit(from:this.type#Value, to:this.type#Value):Boolean + + final def permittedMask:Set[(Int, Int)] = + (for(t1 <- this._indices.values().asScala; // todo there has to be a better way to get this + t2 <- this._indices.values().asScala + if isLicit(t1, t2)) yield { + //println(s"${t1.category} -> ${t2.category}") + t1.intValue -> t2.intValue }).toSet +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/StackedChainNer.scala b/src/main/scala/cc/factorie/app/nlp/ner/StackedChainNer.scala index 8b8f834..02a1ef2 100644 --- a/src/main/scala/cc/factorie/app/nlp/ner/StackedChainNer.scala +++ b/src/main/scala/cc/factorie/app/nlp/ner/StackedChainNer.scala @@ -1,46 +1,22 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - package cc.factorie.app.nlp.ner -import cc.factorie.app.nlp.lemma.{LowercaseTokenLemma, LowercaseLemmatizer} -import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} -import cc.factorie.util._ import java.io._ -import cc.factorie._ -import cc.factorie.app.chain._ -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.embeddings._ -import cc.factorie.app.strings._ +import cc.factorie.app.chain.{ChainModel, Observation} +import cc.factorie.app.nlp.embeddings.SkipGramEmbedding +import cc.factorie.app.nlp.lemma.LowercaseLemmatizer +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Sentence, Token} +import cc.factorie.app.strings.simplifyDigits +import cc.factorie.la.WeightsMapAccumulator import cc.factorie.model.DotFamilyWithStatistics2 -import cc.factorie.optimize.{AdaGrad, ParameterAveraging} -import cc.factorie.util.{BinarySerializer, CmdOptions, HyperparameterMain, JavaHashMap} +import cc.factorie.optimize.{AdaGrad, ParameterAveraging, Trainer} +import cc.factorie.util.{BinarySerializer, CircularBuffer, JavaHashMap} import cc.factorie.variable._ -import cc.factorie.optimize.Trainer -import cc.factorie.la.WeightsMapAccumulator -import scala.reflect.{ClassTag, classTag} +import cc.factorie.{DenseTensor1, Factor, Parameters, Tensor1, la} import scala.collection.mutable.ListBuffer -import scala.io._ import scala.math.round - - -class TokenSequence[T<:NerTag](token: Token)(implicit m: ClassTag[T]) extends collection.mutable.ArrayBuffer[Token] { - this.prepend(token) - val label : String = token.attr[T].categoryValue.split("-")(1) - def key = this.mkString("-") -} +import scala.reflect.{ClassTag, classTag} abstract class StackedChainNer[L<:NerTag](labelDomain: CategoricalDomain[String], newLabel: (Token, String) => L, @@ -260,20 +236,20 @@ abstract class StackedChainNer[L<:NerTag](labelDomain: CategoricalDomain[String] // for (token <- document.tokens) { // val features: ChainNerFeatures = token.attr[ChainNerFeatures] // if(features != null && features.activeCategories.size > 0) { -// val feats: Seq[String] = features.activeCategories.sortWith(_ < _) +// val feats: Seq[String] = features.activeCategories.sortWith(_ < _) // out.println(document.name+":"+token.position+"="+feats.mkString(", ")) // } // } - + def initFeatures(document:Document, vf:Token=>CategoricalVectorVar[String]): Unit = { count=count+1 val tokenSequence = document.tokens.toIndexedSeq //One pass of lemmatising, this should be the same lemmatiser as the one used to construct the lexicon. 
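    // A minimal sketch of the normalization assumed by the comment above (illustrative only,
    // not part of this patch): LowercaseLemmatizer is taken to simply lowercase each token's
    // string into its lemma, so the lexicon lookups done by addLexiconFeatures below match
    // entries that were themselves stored lowercased, e.g.
    //   LowercaseLemmatizer.process(doc)   // each token now carries a lowercase lemma
    //   token.lemmaString                  // "london" for the surface form "London"
    // If a different lemmatizer had been used to build the lexicon, these lookups could miss.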
LowercaseLemmatizer.process(document) - + nerLexiconFeatures.addLexiconFeatures(tokenSequence, vf) - + import cc.factorie.app.strings.simplifyDigits for (token <- document.tokens) { @@ -330,7 +306,7 @@ abstract class StackedChainNer[L<:NerTag](labelDomain: CategoricalDomain[String] } if(aggregate) document.tokens.foreach( aggregateContext(_, vf) ) - + } @@ -519,7 +495,7 @@ abstract class StackedChainNer[L<:NerTag](labelDomain: CategoricalDomain[String] val trainLabels = trainDocuments.flatMap(_.tokens).map(_.attr[L with LabeledMutableDiscreteVar]) //.take(100) val testLabels = testDocuments.flatMap(_.tokens).map(_.attr[L with LabeledMutableDiscreteVar]) //.take(20) - + val vars = for(td <- trainDocuments; sentence <- td.sentences if sentence.length > 1) yield sentence.tokens.map(_.attr[L with LabeledMutableDiscreteVar]) val examples = vars.map(v => new model.ChainLikelihoodExample(v.toSeq)) println("Training with " + examples.length + " examples") @@ -527,13 +503,13 @@ abstract class StackedChainNer[L<:NerTag](labelDomain: CategoricalDomain[String] trainDocuments.foreach(process(_, useModel2=false)) testDocuments.foreach(process(_, useModel2=false)) printEvaluation(trainDocuments, testDocuments, "FINAL 1") - + (trainDocuments ++ testDocuments).foreach( _.tokens.map(token => token.attr += new ChainNer2Features(token))) for(document <- trainDocuments) initFeatures(document, (t:Token)=>t.attr[ChainNer2Features]) for(document <- trainDocuments) initSecondaryFeatures(document) ChainNer2FeaturesDomain.freeze() - + for(document <- testDocuments) initFeatures(document, (t:Token)=>t.attr[ChainNer2Features]) for(document <- testDocuments) initSecondaryFeatures(document) //println(trainDocuments(3).tokens.map(token => token.nerTag.target.categoryValue + " "+token.string+" "+token.attr[ChainNer2Features].toString).mkString("\n")) @@ -601,162 +577,3 @@ abstract class StackedChainNer[L<:NerTag](labelDomain: CategoricalDomain[String] } } -class ConllStackedChainNer(embeddingMap: SkipGramEmbedding, - embeddingDim: Int, - scale: Double, - useOffsetEmbedding: Boolean)(implicit mp:ModelProvider[ConllStackedChainNer], nerLexiconFeatures:NerLexiconFeatures) - extends StackedChainNer[BilouConllNerTag]( - BilouConllNerDomain, - (t, s) => new BilouConllNerTag(t, s), - l => l.token, - embeddingMap, - embeddingDim, - scale, - useOffsetEmbedding, - mp.provide, nerLexiconFeatures) - -//object ConllStackedChainNer extends ConllStackedChainNer(SkipGramEmbedding, 100, 1.0, true, ClasspathURL[ConllStackedChainNer](".factorie")) -class NoEmbeddingsConllStackedChainNer()(implicit mp:ModelProvider[NoEmbeddingsConllStackedChainNer], nerLexiconFeatures:NerLexiconFeatures) extends ConllStackedChainNer(null, 0, 0.0, false)(mp, nerLexiconFeatures) with Serializable -object NoEmbeddingsConllStackedChainNer extends NoEmbeddingsConllStackedChainNer()(ModelProvider.classpath(), StaticLexiconFeatures()) with Serializable - -class OntonotesStackedChainNer(embeddingMap: SkipGramEmbedding, - embeddingDim: Int, - scale: Double, - useOffsetEmbedding: Boolean)(implicit mp:ModelProvider[OntonotesStackedChainNer], nerLexiconFeatures:NerLexiconFeatures) - extends StackedChainNer[BilouOntonotesNerTag]( - BilouOntonotesNerDomain, - (t, s) => new BilouOntonotesNerTag(t, s), - l => l.token, - embeddingMap, - embeddingDim, - scale, - useOffsetEmbedding, - mp.provide, nerLexiconFeatures) - -class NoEmbeddingsOntonotesStackedChainNer()(implicit mp:ModelProvider[NoEmbeddingsOntonotesStackedChainNer], nerLexiconFeatures: NerLexiconFeatures) extends 
OntonotesStackedChainNer(null, 0, 0.0, false)(mp, nerLexiconFeatures) with Serializable -object NoEmbeddingsOntonotesStackedChainNer extends NoEmbeddingsOntonotesStackedChainNer()(ModelProvider.classpath(), StaticLexiconFeatures()) with Serializable - - -class StackedChainNerOpts extends CmdOptions with SharedNLPCmdOptions{ - val trainFile = new CmdOption("train", "eng.train", "FILE", "CoNLL formatted training file.") - val testFile = new CmdOption("test", "eng.testb", "FILE", "CoNLL formatted test file.") - val dataLoader = new CmdOption("data-loader", "conll2003", "STRING", "Data loader for this format.") - val encoding = new CmdOption("encoding", "UTF-8", "STRING", "Encoding of input files.") - val modelDir = new CmdOption[File]("model", new File("StackedNER.factorie"), "FILE", "File for saving or loading model.") - val runXmlDir = new CmdOption("run-xml", "xml", "DIR", "Directory for reading NYTimes XML data on which to run saved model.") - val brownClusFile = new CmdOption("brown", "", "FILE", "File containing brown clusters.") - val aggregateTokens = new CmdOption("aggregate", true, "BOOLEAN", "Turn on context aggregation feature.") - val rate = new CmdOption("rate", 0.18, "DOUBLE", "Learning rate") - val delta = new CmdOption("delta", 0.066, "DOUBLE", "Learning delta") - val saveModel = new CmdOption("save-model", false, "BOOLEAN", "Whether to save the model") - val runOnlyHere = new CmdOption("runOnlyHere", false, "BOOLEAN", "Run Experiments only on this machine") - - val embeddingDir = new CmdOption("embeddingDir", "", "STRING", "location of embedding file") - val embeddingDim = new CmdOption("embeddingDim", 100, "INT", "embedding dimension") - val embeddingScale = new CmdOption("embeddingScale", 10.0, "FLOAT", "The scale of the embeddings") - val useOffsetEmbedding = new CmdOption("useOffsetEmbeddings", true, "BOOLEAN", "Whether to use offset embeddings") - val lang = new CmdOption("language", "en", "STRING", "Lexicons language.") -} -/* -object ConllStackedChainNerTester extends App { - val opts = new StackedChainNerOpts - opts.parse(args) - val ner = - if(opts.modelDir.wasInvoked) - new ConllStackedChainNer(null: SkipGramEmbedding, opts.embeddingDim.value, opts.embeddingScale.value, opts.useOffsetEmbedding.value)(opts.modelDir.value.toURI.toURL, StaticLexiconFeatures(opts.lang.value)) - else NoEmbeddingsConllStackedChainNer - - val testPortionToTake = if(opts.testPortion.wasInvoked) opts.testPortion.value else 1.0 - val dataLoader = opts.dataLoader.value match { - case "conll2003" => load.LoadConll2003(BILOU=true) - case "conll2002" => load.LoadConll2002(BILOU=true) - } - val testDocsFull = dataLoader.fromFilename(opts.testFile.value, encoding = opts.encoding.value) - val testDocs = testDocsFull.take((testDocsFull.length*testPortionToTake).floor.toInt) - - println(ner.test(testDocs)) -} -*/ -/* -object ConllStackedChainNerTrainer extends HyperparameterMain { - def evaluateParameters(args: Array[String]): Double = { - // Parse command-line - val opts = new StackedChainNerOpts - opts.parse(args) - val skipgram = if (opts.embeddingDir.wasInvoked) - new SkipGramEmbedding(opts.embeddingDir.value, opts.embeddingDim.value) - else - null - val ner = new ConllStackedChainNer(skipgram: SkipGramEmbedding, opts.embeddingDim.value, opts.embeddingScale.value, opts.useOffsetEmbedding.value)(ModelProvider.empty, StaticLexiconFeatures(opts.lang.value)) - - ner.aggregate = opts.aggregateTokens.wasInvoked - - if (opts.brownClusFile.wasInvoked) { - println("Reading brown cluster file " + 
opts.brownClusFile.value) - for(line <- Source.fromFile(opts.brownClusFile.value).getLines()){ - val splitLine = line.split("\t") - ner.clusters(splitLine(1)) = splitLine(0) - } - } - - val trainPortionToTake = if(opts.trainPortion.wasInvoked) opts.trainPortion.value else 1.0 - val testPortionToTake = if(opts.testPortion.wasInvoked) opts.testPortion.value else 1.0 - - val dataLoader = opts.dataLoader.value match { - case "conll2003" => load.LoadConll2003(BILOU=true) - case "conll2002" => load.LoadConll2002(BILOU=true) - } - val trainDocsFull = dataLoader.fromFilename(opts.trainFile.value, encoding = opts.encoding.value) - val testDocsFull = dataLoader.fromFilename(opts.testFile.value, encoding = opts.encoding.value) - - val trainDocs = trainDocsFull.take((trainDocsFull.length*trainPortionToTake).floor.toInt) - val testDocs = testDocsFull.take((testDocsFull.length*testPortionToTake).floor.toInt) - - val result = ner.train(trainDocs,testDocs, opts.rate.value, opts.delta.value) - if (opts.saveModel.value) { - ner.serialize(new FileOutputStream(opts.modelDir.value)) - } - - if(opts.targetAccuracy.wasInvoked) cc.factorie.assertMinimalAccuracy(result,opts.targetAccuracy.value.toDouble) - - result - } -} -*/ -/* -object ConllStackedChainNerOptimizer { - def main(args: Array[String]) { - val opts = new StackedChainNerOpts - opts.parse(args) - opts.saveModel.setValue(false) - - if (opts.runOnlyHere.value) { - opts.saveModel.setValue(true) - val result = ConllStackedChainNerTrainer.evaluateParameters(args) - println("result: "+ result) - } - else { - val rate = cc.factorie.util.HyperParameter(opts.rate, new cc.factorie.util.LogUniformDoubleSampler(1e-4, 1e4)) - val delta = cc.factorie.util.HyperParameter(opts.delta, new cc.factorie.util.LogUniformDoubleSampler(1e-4, 1e4)) - /* - val ssh = new cc.factorie.util.SSHActorExecutor("apassos", - Seq("avon1", "avon2"), - "/home/apassos/canvas/factorie-test", - "try-log/", - "cc.factorie.app.nlp.parse.DepParser2", - 10, 5) - */ - val qs = new cc.factorie.util.QSubExecutor(60, "cc.factorie.app.nlp.ner.ConllStackedChainNerTrainer") - val optimizer = new cc.factorie.util.HyperParameterSearcher(opts, Seq(rate, delta), qs.execute, 200, 180, 60) - val result = optimizer.optimize() - println("Got results: " + result.mkString(" ")) - opts.saveModel.setValue(true) - println("Running best configuration...") - import scala.concurrent.Await - import scala.concurrent.duration._ - Await.result(qs.execute(opts.values.flatMap(_.unParse).toArray), 5.hours) - println("Done") - } - } -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/ner/StaticLexiconFeatures.scala b/src/main/scala/cc/factorie/app/nlp/ner/StaticLexiconFeatures.scala index 72c175a..432e801 100644 --- a/src/main/scala/cc/factorie/app/nlp/ner/StaticLexiconFeatures.scala +++ b/src/main/scala/cc/factorie/app/nlp/ner/StaticLexiconFeatures.scala @@ -13,13 +13,11 @@ package cc.factorie.app.nlp.ner import cc.factorie.app.nlp.Token +import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} import cc.factorie.variable.CategoricalVectorVar -import cc.factorie.app.nlp.lexicon.StaticLexicons -import cc.factorie.app.nlp.lexicon.LexiconsProvider -import cc.factorie.app.nlp.lemma.LowercaseTokenLemma class StaticLexiconFeatures(lexicon:StaticLexicons, lang: String) extends NerLexiconFeatures { - //this block serves to initialize all of the lexicons used by the model before processing + //this block serves to initialize all of the lexicon used by the model before processing 
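  // A brief sketch of the eager-loading idiom this block relies on (an assumption about
  // intent, not something the sources state): each StaticLexicons member is loaded lazily,
  // so touching it once inside lexicon.synchronized forces the load up front, before any
  // document is tagged, rather than on the first tagText call. For example:
  //   lexicon.synchronized {
  //     lexicon.wikipedia.PersonAndRedirect.toString()   // forces this lexicon to load now
  //   }
  // The toString() calls below appear to exist only to trigger that one-time load.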
lexicon.synchronized { if (lang.equals("en") || lang.equals("all")){ @@ -64,36 +62,38 @@ class StaticLexiconFeatures(lexicon:StaticLexicons, lang: String) extends NerLex lexicon.wikipedia.PersonAndRedirect.toString() lexicon.wikipedia.OrganizationAndRedirect.toString() } - - if (lang.equals("es") || lang.equals("all")){ - - lexicon.spanish.Continents.toString() - lexicon.spanish.Month.toString() - lexicon.spanish.Day.toString() - lexicon.spanish.PersonFirst.toString() - lexicon.spanish.PersonLast.toString() - lexicon.spanish.Person.toString() - lexicon.spanish.PersonHonorific.toString() - lexicon.spanish.Location.toString() - lexicon.spanish.Organization.toString() - lexicon.spanish.Miscellaneous.toString() - lexicon.spanish.OrgSuffix.toString() - lexicon.spanish.Demonym.toString() - - - lexicon.spanish.WikiBook.toString() - lexicon.spanish.WikiEvent.toString() - lexicon.spanish.WikiBusiness.toString() - lexicon.spanish.WikiFilm.toString() - - lexicon.spanish.WikiPerson.toString() - lexicon.spanish.WikiLocation.toString() - lexicon.spanish.WikiOrganization.toString() - lexicon.spanish.WikiLocationAndRedirect.toString() - lexicon.spanish.WikiPersonAndRedirect.toString() - lexicon.spanish.WikiOrganizationAndRedirect.toString() - } - + + /* Removed Spanish language for this api +// if (lang.equals("es") || lang.equals("all")){ +// +// lexicon.spanish.Continents.toString() +// lexicon.spanish.Month.toString() +// lexicon.spanish.Day.toString() +// lexicon.spanish.PersonFirst.toString() +// lexicon.spanish.PersonLast.toString() +// lexicon.spanish.Person.toString() +// lexicon.spanish.PersonHonorific.toString() +// lexicon.spanish.Location.toString() +// lexicon.spanish.Organization.toString() +// lexicon.spanish.Miscellaneous.toString() +// lexicon.spanish.OrgSuffix.toString() +// lexicon.spanish.Demonym.toString() +// +// +// lexicon.spanish.WikiBook.toString() +// lexicon.spanish.WikiEvent.toString() +// lexicon.spanish.WikiBusiness.toString() +// lexicon.spanish.WikiFilm.toString() +// +// lexicon.spanish.WikiPerson.toString() +// lexicon.spanish.WikiLocation.toString() +// lexicon.spanish.WikiOrganization.toString() +// lexicon.spanish.WikiLocationAndRedirect.toString() +// lexicon.spanish.WikiPersonAndRedirect.toString() +// lexicon.spanish.WikiOrganizationAndRedirect.toString() + +// } + */ } @@ -142,7 +142,7 @@ class StaticLexiconFeatures(lexicon:StaticLexicons, lang: String) extends NerLex lexicon.wikipedia.PersonAndRedirect.tagText(tokenSequence,vf,"WIKI-PERSON-REDIRECT") lexicon.wikipedia.OrganizationAndRedirect.tagText(tokenSequence,vf,"WIKI-ORG-REDIRECT") } - +/* if (lang.equals("es") || lang.equals("all")){ lexicon.spanish.Continents.tagText(tokenSequence,vf,"CONTINENT") @@ -171,7 +171,7 @@ class StaticLexiconFeatures(lexicon:StaticLexicons, lang: String) extends NerLex lexicon.spanish.WikiPersonAndRedirect.tagText(tokenSequence,vf,"WIKI-PERSON-REDIRECT") lexicon.spanish.WikiOrganizationAndRedirect.tagText(tokenSequence,vf,"WIKI-ORG-REDIRECT") } - +*/ } } diff --git a/src/main/scala/cc/factorie/app/nlp/ner/TokenSequence.scala b/src/main/scala/cc/factorie/app/nlp/ner/TokenSequence.scala new file mode 100644 index 0000000..6762002 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/ner/TokenSequence.scala @@ -0,0 +1,11 @@ +package cc.factorie.app.nlp.ner + +import cc.factorie.app.nlp.Token + +import scala.reflect.ClassTag + +class TokenSequence[T<:NerTag](token: Token)(implicit m: ClassTag[T]) extends collection.mutable.ArrayBuffer[Token] { + this.prepend(token) + val label : 
String = token.attr[T].categoryValue.split("-")(1) + def key = this.mkString("-") +} diff --git a/src/main/scala/cc/factorie/app/nlp/ner/WellFormedNer.scala b/src/main/scala/cc/factorie/app/nlp/ner/WellFormedNer.scala index 9601bae..ec4c5ec 100644 --- a/src/main/scala/cc/factorie/app/nlp/ner/WellFormedNer.scala +++ b/src/main/scala/cc/factorie/app/nlp/ner/WellFormedNer.scala @@ -12,9 +12,10 @@ limitations under the License. */ package cc.factorie.app.nlp.ner -import cc.factorie.app.chain.{ChainHelper, ChainCliqueValues} -import cc.factorie.app.nlp.{Document, Token, DocumentAnnotator} -import cc.factorie.la.{Tensor2, DenseTensor2} + +import cc.factorie.app.chain.{ChainCliqueValues, ChainHelper} +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} +import cc.factorie.la.{DenseTensor2, Tensor2} import cc.factorie.util.Logger import scala.reflect.{ClassTag, classTag} diff --git a/src/main/scala/cc/factorie/app/nlp/package.scala b/src/main/scala/cc/factorie/app/nlp/package.scala index 8742009..cac20e8 100644 --- a/src/main/scala/cc/factorie/app/nlp/package.scala +++ b/src/main/scala/cc/factorie/app/nlp/package.scala @@ -63,13 +63,13 @@ package object nlp { def bioBoundaries(labels:Seq[String]): Seq[(Int,Int,String)] = iobBoundaries(labels) - /** Command-line options available on all NLP model trainers. - @author David Belanger */ - trait SharedNLPCmdOptions extends cc.factorie.util.CmdOptions { - val targetAccuracy = new CmdOption("target-accuracy", "", "FLOAT", "target accuracy for this NLP model. It will throw an exception if you don't hit this") - val trainPortion = new CmdOption("train-portion", 1.0, "FLOAT", "portion of train to load") - val testPortion = new CmdOption("test-portion", 1.0, "FLOAT", "portion of test to load") - - } +// /** Command-line options available on all NLP model trainers. +// @author David Belanger */ +// trait SharedNLPCmdOptions extends cc.factorie.util.CmdOptions { +// val targetAccuracy = new CmdOption("target-accuracy", "", "FLOAT", "target accuracy for this NLP model. It will throw an exception if you don't hit this") +// val trainPortion = new CmdOption("train-portion", 1.0, "FLOAT", "portion of train to load") +// val testPortion = new CmdOption("test-portion", 1.0, "FLOAT", "portion of test to load") +// +// } } diff --git a/src/main/scala/cc/factorie/app/nlp/parse/CollapsedParseTree.scala b/src/main/scala/cc/factorie/app/nlp/parse/CollapsedParseTree.scala deleted file mode 100644 index b0b65db..0000000 --- a/src/main/scala/cc/factorie/app/nlp/parse/CollapsedParseTree.scala +++ /dev/null @@ -1,455 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -package cc.factorie.app.nlp.parse - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.phrase.{Phrase, PhraseList} -import cc.factorie.variable._ - -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer - -// Representation for a dependency parse - -// TODO I think this should instead be "ParseEdgeLabelDomain". -akm -object CollapsedParseTreeLabelDomain extends EnumDomain { - ParseTreeLabelDomain.foreach(v => index(v.category)) - - //for collapsed parse trees - Seq("about", "above", "across", "after", "against", "around", "at", "as", "before", "behind", "below", "beneath", "beside", "besides", - "between", "beyond", "by", "down", "during", "except", "for", "from", "in", "inside", "into", "like", "near", "of", "off", "on", "out", - "outside", "over", "since", "through", "throughout", "till", "to", "than", "toward", "under", "until", "up", "upon", "via", "with", "without").foreach(index) - - index("") // necessary for empty categories - def defaultCategory = "nn" -} - -object ParseTree2 { - def mutableFromParseTree(pt:ParseTree) = new MutableParseTree(pt.sentence, pt.targetParents, pt.labels.map(_.categoryValue)) - def immutableFromParseTree(pt:ParseTree) = new ImmutableParseTree(pt.sentence, pt.targetParents, pt.labels.map(_.categoryValue)) - def collapsedFromParseTree(pt:ParseTree) = immutableFromParseTree(pt).toCollapsedParseTree -} - -class ParseTreeParent(val tree:ParseTree2, targetIdx:Int = ParseTree.noIndex) extends LabeledIntegerVariable(targetIdx) { - def vertex = if(value >= 0) tree.vertices(value) else null -} -class ParseTreeLabel2(val tree:ParseTree2, targetValue:String = CollapsedParseTreeLabelDomain.defaultCategory) extends LabeledCategoricalVariable(targetValue) { def domain = CollapsedParseTreeLabelDomain } - -//We need different kinds of vertices, s.t. 
for example simple parse trees and collapsed parse trees can be treated similarly -trait ParseTreeVertex { - def tree:ParseTree2 - def tokens:Seq[Token] - def headToken:Token = tokens.head - def token = headToken -} -class TokenParseTreeVertex(val tree:ParseTree2, override val headToken:Token) extends ParseTreeVertex { - def tokens: Seq[Token] = Seq(headToken) -} -class PhraseParseTreeVertex(val tree:ParseTree2, val phrase:Phrase) extends ParseTreeVertex { - def tokens: Seq[Token] = phrase.tokens - override def headToken: Token = phrase.headToken -} - -trait ParseTree2 { - val sentence:Sentence - - protected val _vertices:Array[ParseTreeVertex] - protected val _labels:Array[ParseTreeLabel2] - protected val _parents:Array[ParseTreeParent] - - private lazy val vertexOfTokenMap = _vertices.flatMap(v => v.tokens.map(_ -> v)).toMap - - //require(_parents.length == _vertices.length) - //require(_labels.length == _vertices.length) - - implicit def parentToInt(p:ParseTreeParent) = p.value - implicit def tokenToVertex(token:Token) = vertexOfTokenMap.getOrElse(token,null) - - //println("ParseTree parents "+theTargetParents.mkString(" ")) - //println(" ParseTree labels "+theTargetLabels.mkString(" ")) - //println(" ParseTree labels "+_labels.map(_.categoryValue).mkString(" ")) - def labels = _labels - def vertices = _vertices - def vertex(idx:Int) = if(idx < 0) null else _vertices(idx) - def parents= _parents - def setParentsToTarget(): Unit = _parents.foreach(p => p.set(p.target.value)(null)) - def numParentsCorrect: Int = _parents.count(_.valueIsTarget) - def parentsAccuracy: Double = numParentsCorrect.toDouble / _parents.length - def numLabelsCorrect: Int = _labels.count(_.valueIsTarget) - def labelsAccuracy: Double = numLabelsCorrect.toDouble / _labels.length - - /** Returns the position in the sentence of the root token. */ - def rootChildIndex: Int = _parents.indexWhere(_.intValue == ParseTree.rootIndex) - /** Return the vertex at the root of the parse tree. The parent of this vertex is null. The parentIndex of this position is -1. */ - def rootChild: ParseTreeVertex = _vertices(rootChildIndex) - - /** Returns the vertex index of the parent of the vertex at position childIndex */ - def parentIndex(childIndex:Int): Int = if (childIndex == ParseTree.rootIndex) ParseTree.noIndex else _parents(childIndex).value - def targetParentIndex(childIndex:Int): Int = if (childIndex == ParseTree.rootIndex) ParseTree.noIndex else _parents(childIndex).target.value - - /** Returns the parent of the vertex at position childIndex */ - def parent(childIndex:Int) = _parents(childIndex) - - /** Returns the parent vertex of the given token */ - def parent(token:Token): ParseTreeVertex = { require(token.sentence eq sentence); tokenToVertex(token) } - - //TODO: all of the following methods are inefficient if the parse tree is fixed, and various things can be precomputed. 
- // see trait ImParseTree, which can be mixed in with ParseTree - - /** Return the vertex index of the first vertex whose parent is 'parentIndex' */ - protected def firstChild(parentIndex:Int): Int = { - var i = 0 - while ( i < _parents.length) { - if (_parents(i).value == parentIndex) return i - i += 1 - } - -1 - } - - /** Return a list of vertices who are the children of the vertex at vertex position 'parentIndex' */ - def children(parentIndex:Int): Seq[ParseTreeVertex] = { - getChildrenIndices(parentIndex).map(_vertices(_)) - } - - val defaultFilter: Int => Boolean = {x => false} - def getChildrenIndices(parentIndex:Int, filter : Int => Boolean = defaultFilter): Seq[Int] = { - val result = new ArrayBuffer[Int] - var i = 0 - while (i < _parents.length) { - if (_parents(i).value == parentIndex) result += i - i += 1 - } - if(filter == defaultFilter) result - else result.sorted.takeWhile( i => !filter(i)) - } - - def subtree(parentIndex:Int): Seq[Token] = { - getSubtreeInds(parentIndex).map(sentence.tokens(_)) - } - - def getSubtreeInds(parentIndex: Int, filter : Int => Boolean = defaultFilter): Seq[Int] = { - val result = new ArrayBuffer[Int]() - result += parentIndex - result ++= getChildrenIndices(parentIndex, filter).flatMap(getSubtreeInds(_)).distinct - result - } - - def leftChildren(parentIndex:Int): Seq[ParseTreeVertex] = { - val result = new scala.collection.mutable.ArrayBuffer[ParseTreeVertex] - var i = 0 - while (i < parentIndex) { - if (_parents(i).value == parentIndex) result += _vertices(i) - i += 1 - } - result - } - def rightChildren(parentIndex:Int): Seq[ParseTreeVertex] = { - val result = new scala.collection.mutable.ArrayBuffer[ParseTreeVertex] - var i = parentIndex+1 - while (i < _parents.length) { - if (_parents(i).value == parentIndex) result += _vertices(i) - i += 1 - } - result - } - /** Return a list of tokens who are the children of the token at sentence position 'parentIndex' and who also have the indicated label value. */ - def childrenLabeled(parentIndex:Int, labelIntValue:Int): Seq[ParseTreeVertex] = { - val result = new scala.collection.mutable.ArrayBuffer[ParseTreeVertex] - var i = 0 - while (i < _parents.length) { - if (_parents(i).value == parentIndex && _labels(i).intValue == labelIntValue) result += _vertices(i) - i += 1 - } - result - } - def leftChildrenLabeled(parentIndex:Int, labelIntValue:Int): Seq[ParseTreeVertex] = { - val result = new scala.collection.mutable.ArrayBuffer[ParseTreeVertex] - var i = 0 - while (i < parentIndex) { - if (_parents(i).value == parentIndex && _labels(i).intValue == labelIntValue) result += sentence.tokens(i) - i += 1 - } - result - } - def rightChildrenLabeled(parentIndex:Int, labelIntValue:Int): Seq[ParseTreeVertex] = { - val result = new scala.collection.mutable.ArrayBuffer[ParseTreeVertex] - var i = parentIndex+1 - while (i < _parents.length) { - if (_parents(i).value == parentIndex && _labels(i).intValue == labelIntValue) result += _vertices(i) - i += 1 - } - result - } - //def childrenOfLabel(token:Token, labelIntValue:Int): Seq[Token] = childrenOfLabel(token.position - sentence.start, labelIntValue) - //def childrenLabeled(index:Int, labelValue:DiscreteValue): Seq[Token] = childrenLabeled(index, labelValue.intValue) - //def childrenOfLabel(token:Token, labelValue:DiscreteValue): Seq[Token] = childrenOfLabel(token.position - sentence.start, labelValue.intValue) - /** Return the label on the edge from the child at sentence position 'index' to its parent. 
*/ - def label(index:Int): ParseTreeLabel2 = _labels(index) - def copy: ParseTree2 = { - val newTree:ParseTree2 = - if(this.isInstanceOf[MutableParseTree]) - new MutableParseTree(sentence, _parents.map(_.target.intValue), _labels.map(_.target.categoryValue)) - else - new ImmutableParseTree(sentence, _parents.map(_.target.intValue), _labels.map(_.target.categoryValue)) - - for (i <- 0 until sentence.length) { - newTree._parents(i).set(this._parents(i).intValue)(null) - newTree._labels(i).set(this._labels(i).intValue)(null) - } - newTree - } - /** Return the label on the edge from 'childToken' to its parent. */ - //def label(childToken:Token): ParseTreeLabel2 = { require(childToken.sentence eq sentence); label(childToken.position - sentence.start) } - override def toString: String = { - val tokenStrings = { - if (_vertices.forall(_.token.posTag ne null)) - _vertices.map(v => v.tokens.map(_.string).mkString(" ") + "/" + v.token.posTag.categoryValue) - else - _vertices.map(_.tokens.map(_.string).mkString(" ")) - } - val labelStrings = _labels.map(_.value.toString()) - val buff = new StringBuffer() - for (i <- 0 until _vertices.length) - buff.append(i + " " + _parents(i).intValue + " " + tokenStrings(i) + " " + labelStrings(i) + "\n") - buff.toString - } - - def toStringTex:String = { - def texEdges(idx:Int, builder:StringBuilder):StringBuilder = this.children(idx) match { - case empty if empty.isEmpty => builder - case children => children.foreach { vertex => - val childIdx = vertex.token.positionInSentence - val parentIdx = vertex.token.parseParentIndex - val label = vertex.token.parseLabel.categoryValue - builder.append(" \\depedge{%s}{%s}{%s}".format(parentIdx + 1, childIdx + 1, label)).append("\n") // latex uses 1-indexing - texEdges(childIdx, builder) - } - builder - } - val sentenceString = this._vertices.map(_.tokens.map(_.string).mkString(" ")).mkString(""" \& """) + """\\""" - - val rootId = this.rootChildIndex - val rootLabel = this.label(rootId).categoryValue // should always be 'root' - val rootString = " \\deproot{%s}{%s}".format(rootId, rootLabel) - - val sb = new StringBuilder - sb.append("""\begin{dependency}""").append("\n") - sb.append(""" \begin{deptext}""").append("\n") - sb.append(sentenceString).append("\n") - sb.append(""" \end{deptext}""").append("\n") - sb.append(rootString).append("\n") - texEdges(rootId, sb) - sb.append("""\end{dependency}""").append("\n") - - sb.toString() - } - - def toImmutable:ImmutableParseTreeLike = this match { - case t:ImmutableParseTreeLike => t - case _:TokenParseTree => - val newTree = new ImmutableParseTree(sentence, parents.map(_.target.value), labels.map(_.target.categoryValue)) - for (i <- 0 until sentence.length) { - newTree.parent(i).set(_parents(i).intValue)(null) - newTree.label(i).set(_labels(i).intValue)(null) - } - newTree - case _ => throw new IllegalArgumentException(s"There is no conversion from ${this.getClass.getName} into a mutable parse tree") - } - - def toMutable:MutableParseTreeLike = this match { - case t:MutableParseTreeLike => t - case _:TokenParseTree => - val newTree = new MutableParseTree(sentence, parents.map(_.target.value), labels.map(_.target.categoryValue)) - for (i <- 0 until sentence.length) { - newTree.parent(i).set(_parents(i).intValue)(null) - newTree.label(i).set(_labels(i).intValue)(null) - } - newTree - case _ => throw new IllegalArgumentException(s"There is no conversion from ${this.getClass.getName} into a mutable parse tree") - } - - def toCollapsedParseTree = this match { - case t: 
CollapsedParseTree => t - case t: TokenParseTree => new CollapsedParseTree(t) - case _ => throw new IllegalArgumentException(s"There is no conversion from ${this.getClass.getName} into a collapsed parse tree") - } -} - -class TokenParseTree(val sentence:Sentence, theTargetParents:Seq[Int], theTargetLabels:Seq[String]) extends ParseTree2 { - override protected val _labels: Array[ParseTreeLabel2] = - theTargetLabels.map(s => new ParseTreeLabel2(this, s)).toArray - override protected val _parents: Array[ParseTreeParent] = - theTargetParents.map(p => new ParseTreeParent(this,p)).toArray - override protected val _vertices: Array[ParseTreeVertex] = - sentence.tokens.map(t => new TokenParseTreeVertex(this,t)).toArray -} - -//inefficient functions for retrieving children -class MutableParseTree(sentence:Sentence, targetParents:Seq[Int], targetLabels:Seq[String]) - extends TokenParseTree(sentence, targetParents, targetLabels) with MutableParseTreeLike { - def this(sentence:Sentence) = this(sentence, Array.fill[Int](sentence.length)(ParseTree.noIndex), Array.tabulate(sentence.length)(i => CollapsedParseTreeLabelDomain.defaultCategory)) // Note: this puts in dummy target data which may be confusing -} -//efficient functions for retrieving children -class ImmutableParseTree(sentence:Sentence, targetParents:Seq[Int], targetLabels:Seq[String]) - extends TokenParseTree(sentence, targetParents, targetLabels) with ImmutableParseTreeLike - -//collapses certain phrases into one vertex and prepositions become edges -class CollapsedParseTree(val parseTree:TokenParseTree) extends ParseTree2 with ImmutableParseTreeLike { - override val sentence: Sentence = parseTree.sentence - override protected val (_labels, _parents, _vertices): (Array[ParseTreeLabel2],Array[ParseTreeParent], Array[ParseTreeVertex]) = { - var phraseTokens = mutable.Map[Token,Phrase]() - val doc = sentence.document - val cf = doc.coref - if(cf != null) { - def addPhrase(p:Phrase) = p.foreach(t => if(phraseTokens.get(t).fold(-1)(_.length) < p.length) phraseTokens += t -> p) - val mentions = cf.mentions - //Sometimes there are nested phrases, so the largest should be chosen - mentions.withFilter(m => sentence.start <= m.phrase.start && sentence.end >= m.phrase.end).foreach(m => addPhrase(m.phrase)) - doc.attr.all[PhraseList].foreach(_.withFilter(p => sentence.start <= p.start && sentence.end >= p.end).foreach(addPhrase)) - } - val vertices = ArrayBuffer[ParseTreeVertex]() - val idxMap = sentence.tokens.foldLeft(mutable.HashMap[AnyRef,Int]())((map,t) => { - if(!phraseTokens.contains(t)) { - //collapse simple prepositions - if(parseTree.label(t.positionInSentence).categoryValue != "prep" || parseTree.getChildrenIndices(t.positionInSentence).exists(c => !parseTree.label(c).categoryValue.matches("pobj|pcomp"))) { - map += t -> map.size - vertices += new TokenParseTreeVertex(this,t) - } - } else { - val p = phraseTokens(t) - if(p.headToken == t) { - map += p -> map.size - vertices += new PhraseParseTreeVertex(this,p) - } - } - map - }) - val aLength = idxMap.size - var i = 0 - val parents = Array.ofDim[ParseTreeParent](aLength) - val labels = Array.ofDim[ParseTreeLabel2](aLength) - while(i < aLength) { - val t = vertices(i).token - val l = parseTree.label(t.positionInSentence).categoryValue - var parent = parseTree.parent(t.positionInSentence).vertex - //collapse prepositions - labels(i) = if((l =="pobj"|| l =="pcomp") && !idxMap.contains(parent.token)) { - val labelString = parent.token.lemmaString - parent = 
parseTree.parent(parent.token.positionInSentence).vertex - new ParseTreeLabel2(this, labelString) - } else new ParseTreeLabel2(this, l) - if(parent != null) { - try { - val parentIdx = idxMap.getOrElse(parent.token, idxMap(phraseTokens(parent.token))) - parents(i) = new ParseTreeParent(this, parentIdx) - } catch { - case e:Throwable => - println(e.printStackTrace()) - } - } else parents(i) = new ParseTreeParent(this, ParseTree.rootIndex) - i += 1 - } - (labels,parents,vertices.toArray) - } -} - -trait MutableParseTreeLike extends ParseTree2 { - /** Set the parent of the token at position 'child' to be at position 'parentIndex'. A parentIndex of -1 indicates the root. */ - def setParent(childIndex:Int, parentIndex:Int): Unit = _parents(childIndex).set(parentIndex)(null) - def setTargetParent(childIndex:Int, parentIndex:Int): Unit = _parents(childIndex).target.set(parentIndex)(null) - /** Set the parent of the token 'child' to be 'parent'. */ - def setParent(child:Token, parent:Token): Unit = { - require(child.sentence eq sentence) - val parentIdx = _vertices.indexWhere(_.tokens.contains(parent)) - val childIdx = _vertices.indexWhere(_.tokens.contains(child)) - _parents(childIdx).set(parentIdx)(null) - } - /** Make the argument the root of the tree. This method does not prevent their being two roots. */ - def setRootChild(token:Token): Unit = setParent(_vertices.indexWhere(_.tokens.contains(token)), -1) - - /** Set the parent of the token 'child' to be 'parent'. */ - def setParent(child:ParseTreeVertex, parent:ParseTreeVertex): Unit = { - require(child.token.sentence eq sentence) - val parentIdx = _vertices.indexOf(parent) - val childIdx = _vertices.indexOf(child) - _parents(childIdx).set(parentIdx)(null) - } - /** Make the argument the root of the tree. This method does not prevent their being two roots. */ - def setRootChild(root:ParseTreeVertex): Unit = setParent(_vertices.indexOf(root), -1) -} - -//Mixin for immutable trees that provides a set of more efficient getters/functions -trait ImmutableParseTreeLike extends ParseTree2 { - lazy val _children = { - val cs = Array.tabulate(_parents.size)(_ => List[Int]()) - ((_parents.length -1) to 0 by -1).foreach(child => { - val parentIdx = _parents(child).value - if(parentIdx >= 0) cs(parentIdx) = child :: cs(parentIdx) - }) - cs - } - - /** Return the token at the root of the parse tree. The parent of this token is null. The parentIndex of this position is -1. */ - override lazy val rootChild: ParseTreeVertex = super.rootChild - - /** Returns the position in the sentence of the root token. */ - override lazy val rootChildIndex: Int = super.rootChildIndex - - /** Return the sentence index of the first token whose parent is 'parentIndex' */ - override protected def firstChild(parentIndex:Int): Int = { - if(parentIndex < 0) rootChildIndex - else _children(parentIndex).headOption.getOrElse(-1) - } - - override def getChildrenIndices(parentIndex:Int, filter : Int => Boolean = defaultFilter): Seq[Int] = { - if(filter == defaultFilter) - _children(parentIndex) - else - _children(parentIndex).filter( i => !filter(i)) - } - - override def leftChildren(parentIndex:Int) = getChildrenIndices(parentIndex).withFilter(_ < parentIndex).map(_vertices(_)) - - override def rightChildren(parentIndex:Int) = getChildrenIndices(parentIndex).withFilter(_ > parentIndex).map(_vertices(_)) - - /** Return a list of tokens who are the children of the token at sentence position 'parentIndex' and who also have the indicated label value. 
*/ - override def childrenLabeled(parentIndex:Int, labelIntValue:Int) = - getChildrenIndices(parentIndex). - withFilter(i => _labels(i).intValue == labelIntValue). - map(_vertices(_)) - - override def leftChildrenLabeled(parentIndex:Int, labelIntValue:Int) = - getChildrenIndices(parentIndex). - withFilter(i => i < parentIndex && _labels(i).intValue == labelIntValue). - map(_vertices(_)) - - override def rightChildrenLabeled(parentIndex:Int, labelIntValue:Int) = - getChildrenIndices(parentIndex). - withFilter(i => i > parentIndex && _labels(i).intValue == labelIntValue). - map(_vertices(_)) -} - -// Example usages: -// token.sentence.attr[ParseTree].parent(token) -// sentence.attr[ParseTree].children(token) -// sentence.attr[ParseTree].setParent(token, parentToken) -// sentence.attr[ParseTree].label(token) -// sentence.attr[ParseTree].label(token).set("SUBJ") - -// Methods also created in Token supporting: -// token.parseParent -// token.setParseParent(parentToken) -// token.parseChildren -// token.parseLabel -// token.leftChildren diff --git a/src/main/scala/cc/factorie/app/nlp/parse/LightweightParseSentence.scala b/src/main/scala/cc/factorie/app/nlp/parse/LightweightParseSentence.scala new file mode 100644 index 0000000..83a3e6c --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/parse/LightweightParseSentence.scala @@ -0,0 +1,24 @@ +package cc.factorie.app.nlp.parse + +import cc.factorie.app.nlp.Sentence + +class LightweightParseSentence(s: Sentence){ + val length: Int = s.length + 1 + val _tokens: Array[LightweightParseToken] = new Array[LightweightParseToken](length-1) + var i = 0; while(i < length-1) { _tokens(i) = new LightweightParseToken(s(i)); i += 1 } + val parse = s.attr[ParseTree] + val goldHeads = Seq(-1) ++ parse._targetParents.map(_ + 1) + val goldLabels = Seq("") ++ parse._labels.map(_.target.categoryValue) + + // we are working with the original sentence, with an additional + // ROOT token that comes at index 0, moving all other indices up by 1: + // idx < 0 -> NULL_TOKEN + // idx = 0 -> ROOT_TOKEN + // 0 < idx < sentence.length+1 -> sentence(idx-1) + // idx > sentence.length -> NULL_TOKEN + def apply(idx: Int) = idx match { + case 0 => RootToken + case i if (i > 0 && i < length) => _tokens(i-1) + case _ => NullToken + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/parse/LightweightParseToken.scala b/src/main/scala/cc/factorie/app/nlp/parse/LightweightParseToken.scala new file mode 100644 index 0000000..659119e --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/parse/LightweightParseToken.scala @@ -0,0 +1,12 @@ +package cc.factorie.app.nlp.parse + +import cc.factorie.app.nlp.Token +import cc.factorie.app.nlp.pos.PosTag + +class LightweightParseToken(t: Token){ + lazy val string = t.string + lazy val posTag = t.attr[PosTag] + lazy val lemma = if(posTag ne null) t.lemmaString else string + lazy val lemmaLower = if(posTag ne null) lemma.toLowerCase else string + lazy val posTagString = if(posTag ne null) posTag.categoryValue else string +} diff --git a/src/main/scala/cc/factorie/app/nlp/parse/NullToken.scala b/src/main/scala/cc/factorie/app/nlp/parse/NullToken.scala new file mode 100644 index 0000000..ee945eb --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/parse/NullToken.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.parse + +import cc.factorie.app.nlp.Token + +object NullToken extends LightweightParseToken(null.asInstanceOf[Token]){ + override lazy val string = ParserConstants.NULL_STRING + override lazy val lemmaLower = ParserConstants.NULL_STRING + 
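// All of NullToken's surface fields fall back to NULL_STRING: LightweightParseSentence.apply returns this sentinel for out-of-range indices, and feature code such as addFeatureNoNulls uses the NULL_STRING suffix to drop features built from a missing context token. + 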
override lazy val posTagString = ParserConstants.NULL_STRING +} diff --git a/src/main/scala/cc/factorie/app/nlp/parse/OntonotesTransitionBasedParser.scala b/src/main/scala/cc/factorie/app/nlp/parse/OntonotesTransitionBasedParser.scala new file mode 100644 index 0000000..ca233e9 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/parse/OntonotesTransitionBasedParser.scala @@ -0,0 +1,4 @@ +package cc.factorie.app.nlp.parse + +class OntonotesTransitionBasedParser(url:java.net.URL) extends TransitionBasedParser(url) +object OntonotesTransitionBasedParser extends OntonotesTransitionBasedParser(cc.factorie.util.ClasspathURL[OntonotesTransitionBasedParser](".factorie")) diff --git a/src/main/scala/cc/factorie/app/nlp/parse/ParseDecision.scala b/src/main/scala/cc/factorie/app/nlp/parse/ParseDecision.scala new file mode 100644 index 0000000..96d00ae --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/parse/ParseDecision.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.parse + +case class ParseDecision(action: String) { + val Array(lrnS, srpS, label) = action.split(" ") + val leftOrRightOrNo = lrnS.toInt // leftarc-rightarc-noarc + val shiftOrReduceOrPass = srpS.toInt // shift-reduce-pass + override def toString = action + def readableString = s"${ParserConstants(leftOrRightOrNo)} ${ParserConstants(shiftOrReduceOrPass)} $label" +} diff --git a/src/main/scala/cc/factorie/app/nlp/parse/ParseState.scala b/src/main/scala/cc/factorie/app/nlp/parse/ParseState.scala new file mode 100644 index 0000000..1bd7b85 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/parse/ParseState.scala @@ -0,0 +1,123 @@ +package cc.factorie.app.nlp.parse + +import scala.annotation.tailrec +import scala.collection.mutable.Set + + +class ParseState(var stack: Int, var input: Int, val reducedIds: Set[Int], val sentence: LightweightParseSentence) { + val parseSentenceLength = sentence.length + + val headIndices = Array.fill[Int](parseSentenceLength)(-1) + val arcLabels = Array.fill[String](parseSentenceLength)("") + + val leftmostDeps = Array.fill[Int](parseSentenceLength)(-1) + val rightmostDeps = Array.fill[Int](parseSentenceLength)(-1) + + def goldHeads = sentence.goldHeads + def goldLabels = sentence.goldLabels + + def setHead(tokenIndex: Int, headIndex: Int, label: String) { + // set head + headIndices(tokenIndex) = headIndex + arcLabels(tokenIndex) = label + + // update left and rightmost dependents + if(headIndex != -1){ + if (tokenIndex < headIndex) + leftmostDeps(headIndex) = tokenIndex + else + rightmostDeps(headIndex) = tokenIndex + } + } + + @tailrec final def isDescendantOf(firstIndex: Int, secondIndex: Int): Boolean = { + val firstHeadIndex = headIndices(firstIndex) + if (firstHeadIndex == -1) false // firstIndex has no head, so it can't be a descendant + else if (headIndices(firstHeadIndex) == secondIndex) true + else isDescendantOf(firstHeadIndex, secondIndex) + } + + def leftmostDependent(tokenIndex: Int): Int = { + if (tokenIndex == -1) -1 + else leftmostDeps(tokenIndex) + } + + def rightmostDependent(tokenIndex: Int): Int = { + if (tokenIndex == -1) -1 + else rightmostDeps(tokenIndex) + } + + def leftmostDependent2(tokenIndex: Int): Int = { + if (tokenIndex == -1) -1 + else{ + val i = leftmostDeps(tokenIndex) + if (i == -1) -1 + else leftmostDeps(i) + } + } + + def rightmostDependent2(tokenIndex: Int): Int = { + if (tokenIndex == -1) -1 + else { + val i = rightmostDeps(tokenIndex) + if (i == -1) -1 + else rightmostDeps(i) + } + } + + def leftNearestSibling(tokenIndex: Int): Int = { + val 
tokenHeadIndex = headIndices(tokenIndex) + if(tokenHeadIndex != -1){ + var i = tokenIndex - 1 + while(i >= 0){ + if (headIndices(i) != -1 && headIndices(i) == tokenHeadIndex) + return i + i -= 1 + } + } + -1 + } + + def rightNearestSibling(tokenIndex: Int): Int = { + val tokenHeadIndex = headIndices(tokenIndex) + if(tokenHeadIndex != -1){ + var i = tokenIndex + 1 + while(i < parseSentenceLength){ + if(headIndices(i) != -1 && headIndices(i) == tokenHeadIndex) + return i + i += 1 + } + } + -1 + } + + def inputToken(offset: Int): Int = { + val i = input + offset + if (i < 0 || parseSentenceLength - 1 < i) -1 + else i + } + + def lambdaToken(offset: Int): Int = { + val i = stack + offset + if (i < 0 || parseSentenceLength - 1 < i) -1 + else i + } + + def stackToken(offset: Int): Int = { + if (offset == 0) + return stack + + var off = math.abs(offset) + var dir = if (offset < 0) -1 else 1 + var i = stack + dir + while (0 < i && i < input) { + if (!reducedIds.contains(i)) { + off -= 1 + if (off == 0) + return i + } + i += dir + } + -1 + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/parse/ParseTree.scala b/src/main/scala/cc/factorie/app/nlp/parse/ParseTree.scala index 3d42312..30ea6bf 100644 --- a/src/main/scala/cc/factorie/app/nlp/parse/ParseTree.scala +++ b/src/main/scala/cc/factorie/app/nlp/parse/ParseTree.scala @@ -1,41 +1,10 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp.parse -import cc.factorie.app.nlp._ -//import cc.factorie.app.nlp.load.LoadOntonotes5 -import cc.factorie.util.Cubbie -import cc.factorie.variable.{EnumDomain, LabeledCategoricalVariable} + +import cc.factorie.app.nlp.{Sentence, Token} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer -// Representation for a dependency parse - -// TODO I think this should instead be "ParseEdgeLabelDomain". -akm -object ParseTreeLabelDomain extends EnumDomain { - val acomp, advcl, advmod, agent, amod, appos, attr, aux, auxpass, cc, ccomp, complm, conj, csubj, csubjpass, - dep, det, dobj, expl, hmod, hyph, infmod, intj, iobj, mark, meta, neg, nmod, nn, npadvmod, nsubj, nsubjpass, - num, number, oprd, parataxis, partmod, pcomp, pobj, poss, possessive, preconj, predet, prep, prt, punct, - quantmod, rcmod, root, xcomp = Value - index("") // necessary for empty categories - freeze() - def defaultCategory = "nn" - def defaultIndex = index(defaultCategory) -} -// TODO I think this should instead be "ParseEdgeLabels extends LabeledCategoricalSeqVariable". 
-akm -class ParseTreeLabel(val tree:ParseTree, targetValue:String = ParseTreeLabelDomain.defaultCategory) extends LabeledCategoricalVariable(targetValue) { def domain = ParseTreeLabelDomain } - object ParseTree { val rootIndex = -1 val noIndex = -2 @@ -63,7 +32,7 @@ class ParseTree(val sentence:Sentence, theTargetParents:Array[Int], theTargetLab def parentsAccuracy: Double = numParentsCorrect.toDouble / _parents.length def numLabelsCorrect: Int = {var result = 0; for (i <- 0 until _labels.length) if (_labels(i).valueIsTarget) result += 1; result } def labelsAccuracy: Double = numLabelsCorrect.toDouble / _labels.length - /** Returns the position in the sentence of the root token. */ + /** Returns the position in the sentence of the root token. */ def rootChildIndex: Int = firstChild(-1) /** Return the token at the root of the parse tree. The parent of this token is null. The parentIndex of this position is -1. */ def rootChild: Token = sentence.tokens(rootChildIndex) @@ -183,7 +152,7 @@ class ParseTree(val sentence:Sentence, theTargetParents:Array[Int], theTargetLab result } //def childrenOfLabel(token:Token, labelIntValue:Int): Seq[Token] = childrenOfLabel(token.position - sentence.start, labelIntValue) - //def childrenLabeled(index:Int, labelValue:DiscreteValue): Seq[Token] = childrenLabeled(index, labelValue.intValue) + //def childrenLabeled(index:Int, labelValue:DiscreteValue): Seq[Token] = childrenLabeled(index, labelValue.intValue) //def childrenOfLabel(token:Token, labelValue:DiscreteValue): Seq[Token] = childrenOfLabel(token.position - sentence.start, labelValue.intValue) /** Return the label on the edge from the child at sentence position 'index' to its parent. */ def label(index:Int): ParseTreeLabel = _labels(index) @@ -354,37 +323,4 @@ class ParseTree(val sentence:Sentence, theTargetParents:Array[Int], theTargetLab else "" } -} - -// Example usages: -// token.sentence.attr[ParseTree].parent(token) -// sentence.attr[ParseTree].children(token) -// sentence.attr[ParseTree].setParent(token, parentToken) -// sentence.attr[ParseTree].label(token) -// sentence.attr[ParseTree].label(token).set("SUBJ") - -// Methods also created in Token supporting: -// token.parseParent -// token.setParseParent(parentToken) -// token.parseChildren -// token.parseLabel -// token.leftChildren - -class ParseTreeCubbie extends Cubbie { - val parents = IntListSlot("parents") - val labels = StringListSlot("labels") - def newParseTree(s:Sentence): ParseTree = new ParseTree(s) // This will be abstract when ParseTree domain is unfixed - def storeParseTree(pt:ParseTree): this.type = { - parents := pt.parents - labels := pt.labels.map(_.categoryValue) - this - } - def fetchParseTree(s:Sentence): ParseTree = { - val pt = newParseTree(s) - for (i <- 0 until s.length) { - pt.setParent(i, parents.value(i)) - pt.label(i).setCategory(labels.value(i))(null) - } - pt - } } \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/parse/ParseTreeLabel.scala b/src/main/scala/cc/factorie/app/nlp/parse/ParseTreeLabel.scala new file mode 100644 index 0000000..1bbc109 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/parse/ParseTreeLabel.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.parse + +import cc.factorie.variable.LabeledCategoricalVariable + + +// TODO I think this should instead be "ParseEdgeLabels extends LabeledCategoricalSeqVariable". 
-akm +class ParseTreeLabel(val tree:ParseTree, targetValue:String = ParseTreeLabelDomain.defaultCategory) extends LabeledCategoricalVariable(targetValue) { def domain = ParseTreeLabelDomain } diff --git a/src/main/scala/cc/factorie/app/nlp/parse/ParseTreeLabelDomain.scala b/src/main/scala/cc/factorie/app/nlp/parse/ParseTreeLabelDomain.scala new file mode 100644 index 0000000..f5f81a4 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/parse/ParseTreeLabelDomain.scala @@ -0,0 +1,15 @@ +package cc.factorie.app.nlp.parse + +import cc.factorie.variable.EnumDomain + +// TODO I think this should instead be "ParseEdgeLabelDomain". -akm +object ParseTreeLabelDomain extends EnumDomain { + val acomp, advcl, advmod, agent, amod, appos, attr, aux, auxpass, cc, ccomp, complm, conj, csubj, csubjpass, + dep, det, dobj, expl, hmod, hyph, infmod, intj, iobj, mark, meta, neg, nmod, nn, npadvmod, nsubj, nsubjpass, + num, number, oprd, parataxis, partmod, pcomp, pobj, poss, possessive, preconj, predet, prep, prt, punct, + quantmod, rcmod, root, xcomp = Value + index("") // necessary for empty categories + freeze() + def defaultCategory = "nn" + def defaultIndex = index(defaultCategory) +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/parse/ParserConstants.scala b/src/main/scala/cc/factorie/app/nlp/parse/ParserConstants.scala new file mode 100644 index 0000000..70e57cd --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/parse/ParserConstants.scala @@ -0,0 +1,45 @@ +package cc.factorie.app.nlp.parse + +object ParserConstants { + val NOTHING = -1 + + val ROOT_ID = 0 + + val SHIFT = 1 + val REDUCE = 2 + val PASS = 3 + + val LEFT = 4 + val RIGHT = 5 + val NO = 6 + + val TRAINING = 7 + val PREDICTING = 8 + val BOOSTING = 9 + val PREDICTING_FAST = 10 + + val NULL_STRING = "<NULL>" + val ROOT_STRING = "<ROOT>" + val SEP = "|" + + // for debugging purposes + def apply(i: Int): String = i match { + case NOTHING => "nothing" + + case SHIFT => "shift" + case REDUCE => "reduce" + case PASS => "pass" + + case LEFT => "left" + case RIGHT => "right" + case NO => "no" + + case TRAINING => "training" + case PREDICTING => "predicting" + case BOOSTING => "boosting" + + case ROOT_ID => "root id" + + case _ => throw new Error(s"Integer value $i is not defined in ParserConstants") + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/parse/ProjectiveGraphBasedParser.scala b/src/main/scala/cc/factorie/app/nlp/parse/ProjectiveGraphBasedParser.scala deleted file mode 100644 index e1ff963..0000000 --- a/src/main/scala/cc/factorie/app/nlp/parse/ProjectiveGraphBasedParser.scala +++ /dev/null @@ -1,420 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.parse - -import java.io._ - -import cc.factorie._ -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.pos.PennPosTag -import cc.factorie.la.{Tensor, WeightsMapAccumulator} -import cc.factorie.model.Parameters -import cc.factorie.optimize._ -import cc.factorie.util.{ClasspathURL, DoubleAccumulator, FileUtils, HyperparameterMain, Threading} -import cc.factorie.variable.{DiscreteDomain, HashFeatureVectorVariable, TensorVar} - -import scala.collection.mutable.ArrayBuffer - -/** A graph-based projective dependency parser. - @author Alexandre Passos */ -class ProjectiveGraphBasedParser extends DocumentAnnotator { - parser => - - def this(url:java.net.URL) = { this(); deserialize(url) } - - object MyFeaturesDomain extends DiscreteDomain(1e7.toInt) // 10 million features - class FeatureVector extends HashFeatureVectorVariable { - def domain = MyFeaturesDomain - } - - def getTokenFeatureVector(t: Token): TensorVar = { - val f = new FeatureVector - val tWord = t.string - val tPos = t.attr[PennPosTag].categoryValue - f += "TOKENPOS="+tPos - f += "TOKENWORD="+tWord - f += "TOKENID="+tPos+"&"+tWord - f - } - - def getParentFeatureVector(p: Token): TensorVar = { - val f = new FeatureVector - val pWord = if (p ne null) p.string else "ROOT" - val pPos = if (p ne null) p.attr[PennPosTag].categoryValue else "ROOTPOS" - f += "PARENTPOS="+pPos - f += "PARENTWORD="+pWord - f += "PARENTID="+pPos+"&"+pWord - assert(f ne null) - f - } - - override def tokenAnnotationString(token:Token): String = { - val sentence = token.sentence - val pt = if (sentence ne null) sentence.attr[ParseTree] else null - if (pt eq null) "_\t_" - else (pt.parentIndex(token.positionInSentence)+1).toString+"\t" - } - - def getPairwiseFeatureVector(t: Token, p: Token): TensorVar = { - val f = new FeatureVector - val tWord = t.string - val tPos = t.attr[PennPosTag].categoryValue - val pWord = if (p ne null) p.string else "ROOT" - val pPos = if (p ne null) p.attr[PennPosTag].categoryValue else "ROOTPOS" - val pPosition = if (p ne null) p.positionInSentence else -1 - val tPosition = t.positionInSentence - val dir = if (pPosition < tPosition) "R" else "L" - f += dir+"WORDPAIR="+tWord+"&"+pWord - f += dir+"POSPAIR="+tPos+"&"+pPos - f += dir+"PARENTPAIRCHILDPOS="+pPos+"&"+pWord+"&"+tPos - f += dir+"PARENTPAIRCHILDWORD="+pPos+"&"+pWord+"&"+tWord - f += dir+"CHILDPAIRPARENTPOS="+tPos+"&"+tWord+"&"+pPos - f += dir+"CHILDPAIRPARENTWORD="+tPos+"&"+tWord+"&"+pWord - f += dir+"JOINTALL="+tPos+"&"+tWord+"&"+pWord+"&"+pPos - if ((p ne null) && (t ne null)) { - val first = if (p.positionInSentence < t.positionInSentence) p else t - val second = if (p.positionInSentence < t.positionInSentence) t else p - var x = first - while (x.sentenceNext ne second) { - x = x.sentenceNext - f += dir+"BETWEENPOS="+pPos+"&"+x.attr[PennPosTag].categoryValue+"&"+tPos - } - val prevHeadPos = if (p.sentenceHasPrev) p.sentencePrev.attr[PennPosTag].categoryValue else "NOPREV" - val prevTokPos = if (t.sentenceHasPrev) t.sentencePrev.attr[PennPosTag].categoryValue else "NOPREV" - val nextHeadPos = if (p.sentenceHasNext) p.sentenceNext.attr[PennPosTag].categoryValue else "NONEXT" - val nextTokPos = if (t.sentenceHasNext) t.sentenceNext.attr[PennPosTag].categoryValue else "NONEXT" - def addFourGramFeature(f: FeatureVector, name: String, a: String, b: String, c: String, d: String): Unit = { - f += name+a+b+c+d - f += 1+name+a+b+c - f += 2+name+a+b+d - f += 3+name+a+c+d - f += 4+name+b+c+d - } - addFourGramFeature(f, 
dir+"HNhPcC=",pPos,nextHeadPos,prevTokPos,tPos) - addFourGramFeature(f, dir+"PhHPcC=", prevHeadPos, pPos, prevTokPos, tPos) - addFourGramFeature(f, dir+"HNhCNc=", pPos, nextHeadPos, tPos, nextTokPos) - addFourGramFeature(f, dir+"PhHCNc=", prevHeadPos, pPos, tPos, nextTokPos) - val distance = math.abs(t.positionInSentence - p.positionInSentence) - for (i <- 0 to distance) { - f += dir+"EdgeLength>="+i - } - val normDistance = distance*10/ t.sentence.length - for (i <- 0 to normDistance) { - f += dir+"NormDistance>="+i - } - } - f - } - - def groundTruthEdges(s: Sentence): Seq[TensorVar] = { - s.attr[ParseTree]._targetParents.zipWithIndex.flatMap(i => { - val parentToken = if (i._1 < 0) null else s.tokens(i._1) - val childToken = s.tokens(i._2) - Seq(getTokenFeatureVector(childToken), getParentFeatureVector(parentToken), getPairwiseFeatureVector(childToken, parentToken)) - }) - } - object DependencyModel extends Parameters { - val weights = Weights(new cc.factorie.la.DenseTensor1(MyFeaturesDomain.dimensionSize)) - } - - class ProjectiveParser(val weights: Tensor, val sent: Sentence, val negativeExamples: Seq[(Int,Int)]) { - val knownParents = ArrayBuffer[Int]() - val sentLength = sent.length - (0 until sentLength).foreach(e => knownParents.append(-3)) - negativeExamples.foreach(e => knownParents(e._2) = e._1) - val tokenScores = Array.fill(sent.length+1)(0.0) - val parentScores = Array.fill(sent.length+1)(0.0) - parentScores(0) = weights.dot(getParentFeatureVector(null).value) - for (i <- 0 until sentLength) { - tokenScores(i+1) = weights.dot(getTokenFeatureVector(sent.tokens(i)).value) - parentScores(i+1) = weights.dot(getParentFeatureVector(sent.tokens(i)).value) - } - val edgeScores = Array.fill(sent.length+1,sent.length+1)(Double.NaN) - def getEdgeScore(parent: Int, child: Int) : Double = { - assert(parent != child, "can't add an edge from a token to itself") - if (edgeScores(parent)(child).isNaN) { - val loss = if ((child > 0) && (knownParents(child-1) == parent -1)) -1.0 else 0.0 - edgeScores(parent)(child) = if (child > 0) - loss + weights.dot(getPairwiseFeatureVector(sent.tokens(child - 1), if (parent > 0) sent.tokens(parent - 1) else null).value) + tokenScores(child) + parentScores(parent) - else 0 - } - edgeScores(parent)(child) - } - - - class SpanData(val split: Int, val score: Double) - case class LeftComplete(override val split: Int, override val score: Double) extends SpanData(split, score) - case class LeftIncomplete(override val split: Int, override val score: Double) extends SpanData(split, score) - case class RightComplete(override val split: Int, override val score: Double) extends SpanData(split, score) - case class RightIncomplete(override val split: Int, override val score: Double) extends SpanData(split, score) - - def backwardSearch(left: Int, right: Int, d: SpanData, edges: ArrayBuffer[(Int, Int)]) { - if (left >= right) return - d match { - case d: LeftComplete => - val node = lcTable(left)(right) - // println("searching lc " + left + " " + right) - backwardSearch(left, node.split, LeftIncomplete(0, 0), edges) - backwardSearch(node.split, right, LeftComplete(0, 0), edges) - case d: RightComplete => - val node = rcTable(left)(right) - // println("searching rc " + left + " " + right) - backwardSearch(left, node.split, RightComplete(0, 0), edges) - backwardSearch(node.split, right, RightIncomplete(0, 0), edges) - case d: LeftIncomplete => - val node = liTable(left)(right) - // println("searching li " + left + " " + right) - edges.append((left, right)) - 
backwardSearch(left, node.split, LeftComplete(0, 0), edges) - backwardSearch(node.split+1, right, RightComplete(0, 0), edges) - case d: RightIncomplete => - // println("searching ri " + left + " " + right) - val node = riTable(left)(right) - edges.append((right, left)) - backwardSearch(left, node.split, LeftComplete(0, 0), edges) - backwardSearch(node.split+1, right, RightComplete(0, 0), edges) - } - } - - val lcTable = Array.fill[LeftComplete](sentLength+1, sentLength+1)(null) - val liTable = Array.fill[LeftIncomplete](sentLength+1, sentLength+1)(null) - val rcTable = Array.fill[RightComplete](sentLength+1, sentLength+1)(null) - val riTable = Array.fill[RightIncomplete](sentLength+1, sentLength+1)(null) - - def parse() = { - for (i <- 0 until sentLength+1) { - lcTable(i)(i) = LeftComplete(i, 0) - liTable(i)(i) = LeftIncomplete(i, 0) - rcTable(i)(i) = RightComplete(i, 0) - riTable(i)(i) = RightIncomplete(i, 0) - } - - for (height <- 1 until sentLength+1) { - for (left <- 0 until sentLength+1 - height) { - // here we'll fill the table cell left,left+height - val right = left+height - var bestLI = null.asInstanceOf[LeftIncomplete] - var bestRI = null.asInstanceOf[RightIncomplete] - for (split <- left until left+height) { - val li = LeftIncomplete(split, lcTable(left)(split).score + rcTable(split+1)(right).score) - bestLI = if ((bestLI eq null) || (li.score > bestLI.score)) li else bestLI - val ri = RightIncomplete(split, lcTable(left)(split).score + rcTable(split+1)(right).score) - bestRI = if ((bestRI eq null) || (ri.score > bestRI.score)) ri else bestRI - } - liTable(left)(right) = LeftIncomplete(bestLI.split, bestLI.score + getEdgeScore(left, right)) - riTable(left)(right) = RightIncomplete(bestRI.split, bestRI.score + getEdgeScore(right, left)) - - var bestLC = null.asInstanceOf[LeftComplete] - for (split <- left+1 to left+height) { - // println(left + " " + split + " " + (left+height)) - val lc = LeftComplete(split, liTable(left)(split).score + lcTable(split)(right).score) - bestLC = if ((bestLC eq null) || (lc.score > bestLC.score)) lc else bestLC - } - lcTable(left)(right) = bestLC - - var bestRC = null.asInstanceOf[RightComplete] - for (split <- left until left+height) { - val rc = RightComplete(split, rcTable(left)(split).score + riTable(split)(right).score) - bestRC = if ((bestRC eq null) || (rc.score > bestRC.score)) rc else bestRC - } - rcTable(left)(right) = bestRC - } - } - val finalParse = lcTable(0)(sentLength) - val score = finalParse.score - val edges = ArrayBuffer[(Int, Int)]() - backwardSearch(0, sentLength, finalParse, edges) - val parents = Array.ofDim[Int](sentLength) - var sum = 0.0 - edges.foreach(e => { - sum += getEdgeScore(e._1, e._2) - assert(parents(e._2-1) == 0) - assert(e._2 != 0) - parents(e._2-1) = e._1-1 - }) - assert(math.abs(score - sum) < 0.01*math.abs(score) + 0.0001, "edge scores should match: " + score + " " + sum) - (parents, score) - } - } - - class StructuredPerceptronParsingExample(val sent: Sentence) extends cc.factorie.optimize.Example { - def accumulateValueAndGradient(value: DoubleAccumulator, gradient: WeightsMapAccumulator) { - if (gradient ne null) { - val gt = groundTruthEdges(sent) - gt.foreach(f => gradient.accumulate(DependencyModel.weights, f.value)) - - val projectiveParser = new ProjectiveParser(DependencyModel.weights.value, - sent, - sent.attr[ParseTree].targetParents.zipWithIndex) - val (parents, score) = projectiveParser.parse() - for (i <- 0 until sent.tokens.length) { - gradient.accumulate(DependencyModel.weights, - 
getPairwiseFeatureVector(sent.tokens(i), if (parents(i) == -1) null else sent.tokens(parents(i))).value, - -1.0) - gradient.accumulate(DependencyModel.weights, - getTokenFeatureVector(sent.tokens(i)).value, - -1.0) - gradient.accumulate(DependencyModel.weights, - getParentFeatureVector(if (parents(i) == -1) null else sent.tokens(parents(i))).value, - -1.0) - if (value ne null) { - val trueParent = sent.attr[ParseTree].parents(i) - val returned = parents(i) - value.accumulate(math.min(0, projectiveParser.getEdgeScore(trueParent+1,i+1) - projectiveParser.getEdgeScore(returned+1, i+1))) - } - } - } - } - } - - def serialize(filename: String) = { - val file = new File(filename); if (file.getParentFile != null && !file.getParentFile.exists) file.getParentFile.mkdirs() - cc.factorie.util.BinarySerializer.serialize(DependencyModel, file) - } - def deserialize(file: String) = cc.factorie.util.BinarySerializer.deserialize(DependencyModel, new File(file)) - def deserialize(url: java.net.URL) = cc.factorie.util.BinarySerializer.deserialize(DependencyModel, new DataInputStream(url.openConnection().getInputStream)) - - def train(trainSentences: Seq[Sentence], testSentences: Seq[Sentence], file: String, nThreads: Int, nIters: Int = 10) { - val examples = trainSentences.map(new StructuredPerceptronParsingExample(_)) - val rng = new scala.util.Random(0) - val opt = new AdaMira(0.1) with ParameterAveraging // new cc.factorie.optimize.AdaGradRDA(1.0, 0.0, 0.0001, 0.0001) - val trainer = new optimize.SynchronizedOptimizerOnlineTrainer(DependencyModel.parameters, opt, maxIterations = 10, nThreads = 1) - for (iteration <- 0 until nIters) { - trainer.processExamples(rng.shuffle(examples)) - opt match { case op: ParameterAveraging => op.setWeightsToAverage(DependencyModel.parameters) } - val t0 = System.currentTimeMillis() - println("Predicting train set..."); Threading.parForeach(trainSentences, nThreads) { s => parse(s) } // Was par - println("Predicting test set..."); Threading.parForeach(testSentences, nThreads) { s => parse(s) } // Was par - println("Processed in " + (trainSentences.map(_.tokens.length).sum+testSentences.map(_.tokens.length).sum)*1000.0/(System.currentTimeMillis()-t0) + " tokens per second") - println("Training UAS = "+ ParserEval.calcUas(trainSentences.map(_.attr[ParseTree]))) - println(" Testing UAS = "+ ParserEval.calcUas(testSentences.map(_.attr[ParseTree]))) - opt match { case op: ParameterAveraging => op.unSetWeightsToAverage(DependencyModel.parameters) } - println() - //println("Saving model...") - //if (file != "") serialize(file + "-iter-"+iteration) - } - println("Finished training.") - opt.finalizeWeights(DependencyModel.parameters) - } - - def test(testSentences:Iterable[Sentence]): (Double, Double, Double, Double) = { - val t0 = System.currentTimeMillis() - testSentences.foreach(parse) - val totalTime = System.currentTimeMillis() - t0 - val totalTokens = testSentences.map(_.tokens.length).sum - val totalSentences = testSentences.size - val pred = testSentences.map(_.attr[ParseTree]) - (ParserEval.calcLas(pred), ParserEval.calcUas(pred), totalTokens*1000.0/totalTime, totalSentences*1000.0/totalTime) - } - - def parse(sent: Sentence) { - val tree = sent.attr.getOrElseUpdate(new ParseTree(sent)) - val parser = new ProjectiveParser(DependencyModel.weights.value, sent, Seq()) - val (parents, _) = parser.parse() - for (i <- 0 until sent.length) tree.setParent(i, parents(i)) - } - - def process(document: Document) = { - document.sentences.foreach(parse) - document - } - def 
prereqAttrs: Iterable[Class[_]] = List(classOf[Sentence], classOf[pos.PennPosTag]) // TODO Also TokenLemma? But we don't have a lemmatizer that matches the training data - def postAttrs: Iterable[Class[_]] = List(classOf[ParseTree]) -} - -class WSJProjectiveGraphBasedParser(url:java.net.URL) extends ProjectiveGraphBasedParser(url) -object WSJProjectiveGraphBasedParser extends ProjectiveGraphBasedParser(ClasspathURL[WSJProjectiveGraphBasedParser](".factorie")) - -class OntonotesProjectiveGraphBasedParser(url:java.net.URL) extends ProjectiveGraphBasedParser(url) -object OntonotesProjectiveGraphBasedParser extends ProjectiveGraphBasedParser(ClasspathURL[OntonotesProjectiveGraphBasedParser](".factorie")) - -object ProjectiveGraphBasedParserOpts extends cc.factorie.util.DefaultCmdOptions with SharedNLPCmdOptions{ - val trainFile = new CmdOption("train", "", "FILENAME", "Training file.") - val testFile = new CmdOption("test", "", "FILENAME", "Testing file.") - //val devFile = new CmdOption("dev", Nil.asInstanceOf[String], "FILENAME", "CoNLL-2008 dev file") - val trainDir = new CmdOption("trainDir", "", "FILENAME", "Directory containing training files.") - val testDir = new CmdOption("testDir", "", "FILENAME", "Directory containing testing files.") - val ontonotes = new CmdOption("onto", true, "BOOLEAN", "Whether training and testing files are in Ontonotes or CoNLL-2008 format") - val model = new CmdOption("model", "parser-model", "FILE", "File in which to save the trained model.") - val nThreads = new CmdOption("nThreads", Runtime.getRuntime.availableProcessors(), "INT", "Number of threads to use.") -} -/* -object ProjectiveGraphBasedParserTrainer extends HyperparameterMain { - def evaluateParameters(args: Array[String]): Double = { - val opts = ProjectiveGraphBasedParserOpts - opts.parse(args) - - val parser = new ProjectiveGraphBasedParser - - assert(opts.trainFile.wasInvoked || opts.trainDir.wasInvoked) - - // Load the sentences - def loadSentences(fileOpt: opts.CmdOption[String], dirOpt: opts.CmdOption[String]): Seq[Sentence] = { - var fileExt = if (opts.ontonotes.value) ".dep.pmd" else "" - var fileList = Seq.empty[String] - if (fileOpt.wasInvoked) fileList = Seq(fileOpt.value) - if (dirOpt.wasInvoked) fileList ++= FileUtils.getFileListFromDir(dirOpt.value, fileExt) - fileList.flatMap(fname => (if (opts.ontonotes.value) load.LoadOntonotes5.fromFilename(fname) else load.LoadConll2008.fromFilename(fname)).head.sentences.toSeq) - } - - val trainSentencesFull = loadSentences(opts.trainFile, opts.trainDir) - //val devSentencesFull = loadSentences(opts.devFile, opts.devDir) - val testSentencesFull = loadSentences(opts.testFile, opts.testDir) - - val trainPortionToTake = if(opts.trainPortion.wasInvoked) opts.trainPortion.value.toDouble else 1.0 - val testPortionToTake = if(opts.testPortion.wasInvoked) opts.testPortion.value.toDouble else 1.0 - val trainSentences = trainSentencesFull.take((trainPortionToTake*trainSentencesFull.length).floor.toInt) - val testSentences = testSentencesFull.take((testPortionToTake*testSentencesFull.length).floor.toInt) - //val devSentences = devSentencesFull.take((testPortionToTake*devSentencesFull.length).floor.toInt) - - println("Total train sentences: " + trainSentences.size) - println("Total test sentences: " + testSentences.size) -// val trainDoc = load.LoadOntonotes5.fromFilename(opts.trainFile.value).head -// val testDoc = load.LoadOntonotes5.fromFilename(opts.testFile.value).head -// -// val trainPortionToTake = if(opts.trainPortion.wasInvoked) 
opts.trainPortion.value.toDouble else 1.0 -// val testPortionToTake = if(opts.testPortion.wasInvoked) opts.testPortion.value.toDouble else 1.0 -// val trainSentencesFull = trainDoc.sentences.toSeq -// val trainSentences = trainSentencesFull.take((trainPortionToTake*trainSentencesFull.length).floor.toInt) -// val testSentencesFull = testDoc.sentences.toSeq -// val testSentences = testSentencesFull.take((testPortionToTake*testSentencesFull.length).floor.toInt) - - - // Train - parser.train(trainSentences, testSentences, opts.model.value, math.min(opts.nThreads.value, Runtime.getRuntime.availableProcessors())) - - // Test - println("Predicting train set..."); - val(trainLAS, trainUAS, trainTokSpeed, trainSentSpeed) = parser.test(trainSentences) - println(s"Training UAS=${trainUAS}, LAS=${trainLAS}, ${trainTokSpeed} tokens/sec, ${trainSentSpeed} sentences/sec") - println() - - println("Predicting test set..."); - val(testLAS, testUAS, testTokSpeed, testSentSpeed) = parser.test(testSentences) - println(s"Training UAS=${testUAS}, LAS=${testLAS}, ${testTokSpeed} tokens/sec, ${testSentSpeed} sentences/sec") - - // Print accuracy diagnostics - //println("Predicting train set..."); parser.process(trainDoc) - //println("Predicting test set..."); parser.process(testDoc) -// println("Training UAS = "+ ParserEval.calcUas(trainDoc.sentences.toSeq.map(_.attr[ParseTree]))) -// val testUAS = ParserEval.calcUas(testDoc.sentences.toSeq.map(_.attr[ParseTree])) -// println("Testing UAS = "+ testUAS) - println() - println("Done.") - if(opts.targetAccuracy.wasInvoked) cc.factorie.assertMinimalAccuracy(testUAS,opts.targetAccuracy.value.toDouble) - - testUAS - } - -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/parse/RootToken.scala b/src/main/scala/cc/factorie/app/nlp/parse/RootToken.scala new file mode 100644 index 0000000..663c687 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/parse/RootToken.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.parse + +import cc.factorie.app.nlp.Token + +object RootToken extends LightweightParseToken(null.asInstanceOf[Token]){ + override lazy val string = ParserConstants.ROOT_STRING + override lazy val lemmaLower = ParserConstants.ROOT_STRING + override lazy val posTagString = ParserConstants.ROOT_STRING +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/parse/TransitionBasedParser.scala b/src/main/scala/cc/factorie/app/nlp/parse/TransitionBasedParser.scala index 9435ce5..243b1f8 100644 --- a/src/main/scala/cc/factorie/app/nlp/parse/TransitionBasedParser.scala +++ b/src/main/scala/cc/factorie/app/nlp/parse/TransitionBasedParser.scala @@ -1,275 +1,60 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ package cc.factorie.app.nlp.parse -import cc.factorie.app.nlp._ +import java.io._ + +import cc.factorie.app.classify.backend.LinearMulticlassClassifier import cc.factorie.app.nlp.lemma.TokenLemma import cc.factorie.app.nlp.pos.PosTag -import scala.annotation.tailrec -import scala.collection.mutable.{ArrayBuffer, Set} -import java.io._ -import cc.factorie.util._ -import cc.factorie.optimize._ -import scala.concurrent.Await -import cc.factorie.variable._ -import cc.factorie.app.classify.backend._ +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Sentence, Token} import cc.factorie.la._ +import cc.factorie.optimize._ +import cc.factorie.util._ +import cc.factorie.variable.{BinaryFeatureVectorVariable, CategoricalDomain, CategoricalVariable, CategoricalVectorDomain} -class LightweightParseToken(t: Token){ - lazy val string = t.string - lazy val posTag = t.attr[PosTag] - lazy val lemma = if(posTag ne null) t.lemmaString else string - lazy val lemmaLower = if(posTag ne null) lemma.toLowerCase else string - lazy val posTagString = if(posTag ne null) posTag.categoryValue else string -} - -class LightweightParseSentence(s: Sentence){ - val length: Int = s.length + 1 - val _tokens: Array[LightweightParseToken] = new Array[LightweightParseToken](length-1) - var i = 0; while(i < length-1) { _tokens(i) = new LightweightParseToken(s(i)); i += 1 } - val parse = s.attr[ParseTree] - val goldHeads = Seq(-1) ++ parse._targetParents.map(_ + 1) - val goldLabels = Seq("") ++ parse._labels.map(_.target.categoryValue) - - // we are working with the original sentence, with an additional - // ROOT token that comes at index 0, moving all other indices up by 1: - // idx < 0 -> NULL_TOKEN - // idx = 0 -> ROOT_TOKEN - // 0 < idx < sentence.length+1 -> sentence(idx-1) - // idx > sentence.length -> NULL_TOKEN - def apply(idx: Int) = idx match { - case 0 => RootToken - case i if (i > 0 && i < length) => _tokens(i-1) - case _ => NullToken - } -} - -object RootToken extends LightweightParseToken(null.asInstanceOf[Token]){ - override lazy val string = ParserConstants.ROOT_STRING - override lazy val lemmaLower = ParserConstants.ROOT_STRING - override lazy val posTagString = ParserConstants.ROOT_STRING -} -object NullToken extends LightweightParseToken(null.asInstanceOf[Token]){ - override lazy val string = ParserConstants.NULL_STRING - override lazy val lemmaLower = ParserConstants.NULL_STRING - override lazy val posTagString = ParserConstants.NULL_STRING -} - -object ParserConstants { - val NOTHING = -1 - - val ROOT_ID = 0 - - val SHIFT = 1 - val REDUCE = 2 - val PASS = 3 - - val LEFT = 4 - val RIGHT = 5 - val NO = 6 - - val TRAINING = 7 - val PREDICTING = 8 - val BOOSTING = 9 - val PREDICTING_FAST = 10 - - val NULL_STRING = "" - val ROOT_STRING = "" - val SEP = "|" - - // for debugging purposes - def apply(i: Int): String = i match { - case NOTHING => "nothing" - - case SHIFT => "shift" - case REDUCE => "reduce" - case PASS => "pass" - - case LEFT => "left" - case RIGHT => "right" - case NO => "no" - - case TRAINING => "training" - case PREDICTING => "predicting" - case BOOSTING => "boosting" - - case ROOT_ID => "root id" - - case _ => throw new Error(s"Integer value $i is not defined in ParserConstants") - } -} - -case class ParseDecision(action: String) { - val Array(lrnS, srpS, label) = action.split(" ") - val leftOrRightOrNo = lrnS.toInt // leftarc-rightarc-noarc - val shiftOrReduceOrPass = srpS.toInt // shift-reduce-pass - override def toString = action - def readableString = 
s"${ParserConstants(leftOrRightOrNo)} ${ParserConstants(shiftOrReduceOrPass)} $label" -} - -class ParseState(var stack: Int, var input: Int, val reducedIds: Set[Int], val sentence: LightweightParseSentence) { - val parseSentenceLength = sentence.length - - val headIndices = Array.fill[Int](parseSentenceLength)(-1) - val arcLabels = Array.fill[String](parseSentenceLength)("") - - val leftmostDeps = Array.fill[Int](parseSentenceLength)(-1) - val rightmostDeps = Array.fill[Int](parseSentenceLength)(-1) - - def goldHeads = sentence.goldHeads - def goldLabels = sentence.goldLabels - - def setHead(tokenIndex: Int, headIndex: Int, label: String) { - // set head - headIndices(tokenIndex) = headIndex - arcLabels(tokenIndex) = label - - // update left and rightmost dependents - if(headIndex != -1){ - if (tokenIndex < headIndex) - leftmostDeps(headIndex) = tokenIndex - else - rightmostDeps(headIndex) = tokenIndex - } - } - - @tailrec final def isDescendantOf(firstIndex: Int, secondIndex: Int): Boolean = { - val firstHeadIndex = headIndices(firstIndex) - if (firstHeadIndex == -1) false // firstIndex has no head, so it can't be a descendant - else if (headIndices(firstHeadIndex) == secondIndex) true - else isDescendantOf(firstHeadIndex, secondIndex) - } - - def leftmostDependent(tokenIndex: Int): Int = { - if (tokenIndex == -1) -1 - else leftmostDeps(tokenIndex) - } - - def rightmostDependent(tokenIndex: Int): Int = { - if (tokenIndex == -1) -1 - else rightmostDeps(tokenIndex) - } - - def leftmostDependent2(tokenIndex: Int): Int = { - if (tokenIndex == -1) -1 - else{ - val i = leftmostDeps(tokenIndex) - if (i == -1) -1 - else leftmostDeps(i) - } - } - - def rightmostDependent2(tokenIndex: Int): Int = { - if (tokenIndex == -1) -1 - else { - val i = rightmostDeps(tokenIndex) - if (i == -1) -1 - else rightmostDeps(i) - } - } - - def leftNearestSibling(tokenIndex: Int): Int = { - val tokenHeadIndex = headIndices(tokenIndex) - if(tokenHeadIndex != -1){ - var i = tokenIndex - 1 - while(i >= 0){ - if (headIndices(i) != -1 && headIndices(i) == tokenHeadIndex) - return i - i -= 1 - } - } - -1 - } - - def rightNearestSibling(tokenIndex: Int): Int = { - val tokenHeadIndex = headIndices(tokenIndex) - if(tokenHeadIndex != -1){ - var i = tokenIndex + 1 - while(i < parseSentenceLength){ - if(headIndices(i) != -1 && headIndices(i) == tokenHeadIndex) - return i - i += 1 - } - } - -1 - } - - def inputToken(offset: Int): Int = { - val i = input + offset - if (i < 0 || parseSentenceLength - 1 < i) -1 - else i - } - - def lambdaToken(offset: Int): Int = { - val i = stack + offset - if (i < 0 || parseSentenceLength - 1 < i) -1 - else i - } - - def stackToken(offset: Int): Int = { - if (offset == 0) - return stack - - var off = math.abs(offset) - var dir = if (offset < 0) -1 else 1 - var i = stack + dir - while (0 < i && i < input) { - if (!reducedIds.contains(i)) { - off -= 1 - if (off == 0) - return i - } - i += dir - } - -1 - } -} +import scala.collection.mutable.ArrayBuffer /** Default transition-based dependency parser. 
*/ class TransitionBasedParser extends DocumentAnnotator { private val logger = Logger.getLogger(this.getClass.getName) - def this(stream:InputStream) = { this(); deserialize(stream) } + def this(stream: InputStream) = { + this(); deserialize(stream) + } + def this(file: File) = this(new FileInputStream(file)) - def this(url:java.net.URL) = { + + def this(url: java.net.URL) = { this() val stream = url.openConnection.getInputStream - if (stream.available <= 0) throw new Error("Could not open "+url) - logger.debug("TransitionBasedParser loading from "+url) + if (stream.available <= 0) throw new Error("Could not open " + url) + logger.debug("TransitionBasedParser loading from " + url) deserialize(stream) } object FeaturesDomain extends CategoricalVectorDomain[String] + class NonProjDependencyParserFeatures(val decisionVariable: ParseDecisionVariable) extends BinaryFeatureVectorVariable[String] { def domain = FeaturesDomain + override def skipNonCategories = domain.dimensionDomain.frozen } def addFeatureString(featureVariable: NonProjDependencyParserFeatures, feat: String) = featureVariable += feat // for things that don't need to be checked for - def addFeatureNoNulls(featureVariable: NonProjDependencyParserFeatures, feat: String) = if(!feat.endsWith(ParserConstants.NULL_STRING)) featureVariable += feat + def addFeatureNoNulls(featureVariable: NonProjDependencyParserFeatures, feat: String) = if (!feat.endsWith(ParserConstants.NULL_STRING)) featureVariable += feat + def addConjunctiveFeatureNoNulls(featureVariable: NonProjDependencyParserFeatures, feats: Array[String]) = { val len = feats.length var i = 0 var addFeats = false - while(i < len && !addFeats) { + while (i < len && !addFeats) { if (!feats(i).endsWith(ParserConstants.NULL_STRING)) addFeats = true i += 1 } - if(addFeats){ + if (addFeats) { val sb = new StringBuilder() i = 0 - while(i < len-1) { + while (i < len - 1) { sb.append(feats(i)) sb.append(ParserConstants.SEP) i += 1 @@ -278,11 +63,12 @@ class TransitionBasedParser extends DocumentAnnotator { featureVariable += sb.toString } } + def addConjunctiveFeatureWithNulls(featureVariable: NonProjDependencyParserFeatures, feats: Array[String]) = { val len = feats.length var i = 0 val sb = new StringBuilder() - while(i < len-1) { + while (i < len - 1) { sb.append(feats(i)) sb.append(ParserConstants.SEP) i += 1 @@ -293,8 +79,91 @@ class TransitionBasedParser extends DocumentAnnotator { val predictingNPSR = new NonProjectiveShiftReduce(classify) - object ParseDecisionDomain extends CategoricalDomain[String]{ + //* Takes features and turns them into a parse decision using predict(ParseDecisionVariable => ParseDecision) */ + class NonProjectiveShiftReduce(val predict: ParseDecisionVariable => ParseDecision) { + import ParserConstants._ + + def getParseDecisions(s: LightweightParseSentence): ArrayBuffer[ParseDecisionVariable] = { + val state = new ParseState(0, 1, JavaHashSet[Int](), s) + val decisions = new ArrayBuffer[ParseDecisionVariable] { override val initialSize = 100 } + while(state.input < state.parseSentenceLength) { + if (state.stack < 0) + noShift(state) + else { + val decisionVariable = new ParseDecisionVariable(state) + val label = predict(decisionVariable) + decisions += decisionVariable + transition(state, label) + } + } + decisions + } + + def parse(s: LightweightParseSentence): (Array[Int], Array[String]) = { + val state = new ParseState(0, 1, JavaHashSet[Int](), s) + while(state.input < state.parseSentenceLength) { + if (state.stack < 0) + noShift(state) + else { + val 
decision = new ParseDecisionVariable(state) + val label = predict(decision) + transition(state, label) + } + } + (state.headIndices, state.arcLabels) + } + + private def transition(state: ParseState, label: ParseDecision) = { + if (label.leftOrRightOrNo == LEFT) { + if (state.stack == ROOT_ID) noShift(state) + else if (state.isDescendantOf(state.inputToken(0), state.stackToken(0))) noPass(state) + else if (label.shiftOrReduceOrPass == REDUCE) leftReduce(label.label, state) + else leftPass(label.label, state) + } + else if (label.leftOrRightOrNo == RIGHT) { + if (state.isDescendantOf(state.stackToken(0), state.inputToken(0))) noPass(state) + else if (label.shiftOrReduceOrPass == SHIFT) rightShift(label.label, state) + else rightPass(label.label, state) + } + else { + if (label.shiftOrReduceOrPass == SHIFT) noShift(state) + else if (label.shiftOrReduceOrPass == REDUCE && state.headIndices(state.stackToken(0)) != -1) noReduce(state) + else noPass(state) + } + } + + private def passAux(state: ParseState): Unit = { + var i = state.stack - 1 + while (i >= 0) { + if (!state.reducedIds.contains(i)) { + state.stack = i + return + } + i -= 1 + } + state.stack = i + } + + private def leftArc(label: String, state: ParseState) { state.setHead(state.stackToken(0), state.inputToken(0), label) } + private def rightArc(label: String, state: ParseState) { state.setHead(state.inputToken(0), state.stackToken(0), label) } + + private def shift(state: ParseState) { state.stack = state.input; state.input += 1 } + private def reduce(state: ParseState) { state.reducedIds.add(state.stack); passAux(state) } + private def pass(state: ParseState) { passAux(state: ParseState) } + + private def noShift(state: ParseState) { shift(state) } + private def noReduce(state: ParseState) { reduce(state) } + private def noPass(state: ParseState) { pass(state) } + private def leftReduce(label: String, state: ParseState) { leftArc(label, state); reduce(state) } + private def leftPass(label: String, state: ParseState) { leftArc(label, state); pass(state) } + private def rightShift(label: String, state: ParseState) { rightArc(label, state); shift(state) } + private def rightPass(label: String, state: ParseState) { rightArc(label, state); pass(state) } + } + + object ParseDecisionDomain extends CategoricalDomain[String] { + import ParserConstants._ + val defaultLabel = ParseTreeLabelDomain.defaultCategory val defaultCategory = NOTHING + " " + NOTHING + " " + defaultLabel this += defaultCategory @@ -302,10 +171,11 @@ class TransitionBasedParser extends DocumentAnnotator { class ParseDecisionVariable(val state: ParseState) extends CategoricalVariable[String] { def domain = ParseDecisionDomain + val features = new NonProjDependencyParserFeatures(this) var target = -1 } - + class ParseDecisionExample(decisionVariable: ParseDecisionVariable, m: LinearMulticlassClassifier, objective: MultivariateOptimizableObjective[Int]) extends Example { def accumulateValueAndGradient(value: DoubleAccumulator, gradient: WeightsMapAccumulator): Unit = { val (obj, objGradient) = objective.valueAndGradient(m.predict(decisionVariable.features.value), decisionVariable.target) @@ -316,8 +186,8 @@ class TransitionBasedParser extends DocumentAnnotator { def computeFeatures(state: ParseState, featureVariable: NonProjDependencyParserFeatures, addFeature: (NonProjDependencyParserFeatures, String) => Unit, addConjunctiveFeature: (NonProjDependencyParserFeatures, Array[String]) => Unit) = { - // don't use growable tensor at test time -- we know the size of the domain 
- if(FeaturesDomain.dimensionDomain.frozen) + // don't use growable tensor at test time -- we know the size of the domain + if (FeaturesDomain.dimensionDomain.frozen) featureVariable.set(new SparseBinaryTensor1(FeaturesDomain.dimensionDomain.size))(null) else featureVariable.set(new GrowableSparseBinaryTensor1(FeaturesDomain.dimensionDomain))(null) @@ -333,7 +203,7 @@ class TransitionBasedParser extends DocumentAnnotator { val lambdaLemma = "l:m:" + lambdaToken.lemmaLower val lambdaPos = "l:p:" + lambdaToken.posTagString - val betaForm = "b:f:" + betaToken.string + val betaForm = "b:f:" + betaToken.string val betaLemma = "b:m:" + betaToken.lemmaLower val betaPos = "b:p:" + betaToken.posTagString @@ -378,26 +248,26 @@ class TransitionBasedParser extends DocumentAnnotator { val beta2 = state.sentence(beta2Index) val beta3 = state.sentence(beta3Index) - val stackLemma_1 = "s-1:m:"+stack_1.lemmaLower - val lambdaLemma_1 = "l-1:m:"+lambda_1.lemmaLower - val lambdaLemma1 = "l1:m:"+lambda1.lemmaLower - val betaLemma_2 = "b-2:m:"+beta_2.lemmaLower - val betaLemma_1 = "b-1:m:"+beta_1.lemmaLower - val betaLemma1 = "b1:m:"+beta1.lemmaLower - val betaLemma2 = "b2:m:"+beta2.lemmaLower - - val lambdaPos_2 = "l-2:p:"+lambda_2.posTagString - val lambdaPos_1 = "l-1:p:"+lambda_1.posTagString - val lambdaPos1 = "l1:p:"+lambda1.posTagString - val lambdaPos2 = "l2:p:"+lambda2.posTagString - val betaPos_1 = "b-1:p:"+beta_1.posTagString - val betaPos1 = "b1:p:"+beta1.posTagString - - val stackPos_2 = "s-2:p:"+stack_2.posTagString - val stackPos_1 = "s-1:p:"+stack_1.posTagString - val betaPos_2 = "b-2:p:"+beta_2.posTagString - val betaPos2 = "b2:p:"+beta2.posTagString - val betaPos3 = "b3:p:"+beta3.posTagString + val stackLemma_1 = "s-1:m:" + stack_1.lemmaLower + val lambdaLemma_1 = "l-1:m:" + lambda_1.lemmaLower + val lambdaLemma1 = "l1:m:" + lambda1.lemmaLower + val betaLemma_2 = "b-2:m:" + beta_2.lemmaLower + val betaLemma_1 = "b-1:m:" + beta_1.lemmaLower + val betaLemma1 = "b1:m:" + beta1.lemmaLower + val betaLemma2 = "b2:m:" + beta2.lemmaLower + + val lambdaPos_2 = "l-2:p:" + lambda_2.posTagString + val lambdaPos_1 = "l-1:p:" + lambda_1.posTagString + val lambdaPos1 = "l1:p:" + lambda1.posTagString + val lambdaPos2 = "l2:p:" + lambda2.posTagString + val betaPos_1 = "b-1:p:" + beta_1.posTagString + val betaPos1 = "b1:p:" + beta1.posTagString + + val stackPos_2 = "s-2:p:" + stack_2.posTagString + val stackPos_1 = "s-1:p:" + stack_1.posTagString + val betaPos_2 = "b-2:p:" + beta_2.posTagString + val betaPos2 = "b2:p:" + beta2.posTagString + val betaPos3 = "b3:p:" + beta3.posTagString addFeature(featureVariable, stackLemma_1) addFeature(featureVariable, lambdaLemma_1) @@ -448,33 +318,33 @@ class TransitionBasedParser extends DocumentAnnotator { addConjunctiveFeature(featureVariable, Array(betaPos3, lambdaPos, betaPos)) // 2nd order features - val lambdaHeadIndex = if(lambdaIndex > -1) state.headIndices(lambdaIndex) else -1 - val lambdaArcLabel = if(lambdaIndex > -1) state.arcLabels(lambdaIndex) else ParserConstants.NULL_STRING + val lambdaHeadIndex = if (lambdaIndex > -1) state.headIndices(lambdaIndex) else -1 + val lambdaArcLabel = if (lambdaIndex > -1) state.arcLabels(lambdaIndex) else ParserConstants.NULL_STRING val lambdaHeadToken = state.sentence(lambdaHeadIndex) - val lambdaLeftmostDepIndex = if(lambdaIndex > -1) state.leftmostDependent(lambdaIndex) else -1 - val lambdaRightmostDepIndex = if(lambdaIndex > -1) state.rightmostDependent(lambdaIndex) else -1 - val betaLeftmostDepIndex = if(betaIndex > -1) 
state.leftmostDependent(betaIndex) else -1 - val lambdaLeftNearestSibIndex = if(lambdaIndex > -1) state.leftNearestSibling(lambdaIndex) else -1 + val lambdaLeftmostDepIndex = if (lambdaIndex > -1) state.leftmostDependent(lambdaIndex) else -1 + val lambdaRightmostDepIndex = if (lambdaIndex > -1) state.rightmostDependent(lambdaIndex) else -1 + val betaLeftmostDepIndex = if (betaIndex > -1) state.leftmostDependent(betaIndex) else -1 + val lambdaLeftNearestSibIndex = if (lambdaIndex > -1) state.leftNearestSibling(lambdaIndex) else -1 val lambdaLeftmostDep = state.sentence(lambdaLeftmostDepIndex) val lambdaRightmostDep = state.sentence(lambdaRightmostDepIndex) val betaLeftmostDep = state.sentence(betaLeftmostDepIndex) -// val lambdaLeftNearestSib = state.sentence(lambdaLeftNearestSibIndex) + // val lambdaLeftNearestSib = state.sentence(lambdaLeftNearestSibIndex) val lambdaHeadLemma = "l_h:m:" + lambdaHeadToken.lemmaLower val lambdaHeadPos = "l_h:p:" + lambdaHeadToken.posTagString val lambdaHeadLabel = "l:d:" + lambdaArcLabel - val lambdaLeftmostDepLemma = "l_lmd:m:"+lambdaLeftmostDep.lemmaLower - val lambdaRightmostDepLemma = "l_rmd:m:"+lambdaRightmostDep.lemmaLower - val betaLeftmostDepLemma = "b_lmd:m:"+betaLeftmostDep.lemmaLower - val lambdaLeftmostDepPos = "l_lmd:p:"+lambdaLeftmostDep.posTagString - val lambdaRightmostDepPos = "l_rmd:p:"+lambdaRightmostDep.posTagString - val betaLeftmostDepPos = "b_lmd:p:"+betaLeftmostDep.posTagString + val lambdaLeftmostDepLemma = "l_lmd:m:" + lambdaLeftmostDep.lemmaLower + val lambdaRightmostDepLemma = "l_rmd:m:" + lambdaRightmostDep.lemmaLower + val betaLeftmostDepLemma = "b_lmd:m:" + betaLeftmostDep.lemmaLower + val lambdaLeftmostDepPos = "l_lmd:p:" + lambdaLeftmostDep.posTagString + val lambdaRightmostDepPos = "l_rmd:p:" + lambdaRightmostDep.posTagString + val betaLeftmostDepPos = "b_lmd:p:" + betaLeftmostDep.posTagString - val lambdaLeftmostDepHeadLabel = "l_lmd:d:" + (if(lambdaLeftmostDepIndex > -1 && state.headIndices(lambdaLeftmostDepIndex) > -1) state.arcLabels(lambdaLeftmostDepIndex) else ParserConstants.NULL_STRING) - val lambdaRightmostDepHeadLabel = "l_rmd:d:" + (if(lambdaRightmostDepIndex > -1 && state.headIndices(lambdaRightmostDepIndex) > -1) state.arcLabels(lambdaRightmostDepIndex) else ParserConstants.NULL_STRING) - val lambdaLeftNearestSibHeadLabel = "l_lns:d:" + (if(lambdaLeftNearestSibIndex > -1 && state.headIndices(lambdaLeftNearestSibIndex) > -1) state.arcLabels(lambdaLeftNearestSibIndex) else ParserConstants.NULL_STRING) + val lambdaLeftmostDepHeadLabel = "l_lmd:d:" + (if (lambdaLeftmostDepIndex > -1 && state.headIndices(lambdaLeftmostDepIndex) > -1) state.arcLabels(lambdaLeftmostDepIndex) else ParserConstants.NULL_STRING) + val lambdaRightmostDepHeadLabel = "l_rmd:d:" + (if (lambdaRightmostDepIndex > -1 && state.headIndices(lambdaRightmostDepIndex) > -1) state.arcLabels(lambdaRightmostDepIndex) else ParserConstants.NULL_STRING) + val lambdaLeftNearestSibHeadLabel = "l_lns:d:" + (if (lambdaLeftNearestSibIndex > -1 && state.headIndices(lambdaLeftNearestSibIndex) > -1) state.arcLabels(lambdaLeftNearestSibIndex) else ParserConstants.NULL_STRING) addFeature(featureVariable, lambdaHeadLemma) addFeature(featureVariable, lambdaLeftmostDepLemma) @@ -498,20 +368,20 @@ class TransitionBasedParser extends DocumentAnnotator { addConjunctiveFeature(featureVariable, Array(lambdaLeftNearestSibHeadLabel, lambdaPos, betaPos)) // 3rd order features -// val (lambdaGrandHead, lambdaGrandHeadTok) = if(lambdaTok.hasGrandHead) (Some(lambdaTok.grandHead), 
Some(lambdaTok.grandHead.depToken)) else (NULL, NULL) - val lambdaGrandHeadIdx = if(lambdaHeadIndex > -1) state.headIndices(lambdaHeadIndex) else -1 + // val (lambdaGrandHead, lambdaGrandHeadTok) = if(lambdaTok.hasGrandHead) (Some(lambdaTok.grandHead), Some(lambdaTok.grandHead.depToken)) else (NULL, NULL) + val lambdaGrandHeadIdx = if (lambdaHeadIndex > -1) state.headIndices(lambdaHeadIndex) else -1 val lambdaGrandHeadToken = state.sentence(lambdaGrandHeadIdx) - val lambdaLeftmostDep2Index = if(lambdaIndex > -1) state.leftmostDependent2(lambdaIndex) else -1 - val lambdaRightmostDep2Index = if(lambdaIndex > -1) state.rightmostDependent2(lambdaIndex) else -1 - val betaLeftmostDep2Index = if(betaIndex > -1) state.leftmostDependent2(betaIndex) else -1 + val lambdaLeftmostDep2Index = if (lambdaIndex > -1) state.leftmostDependent2(lambdaIndex) else -1 + val lambdaRightmostDep2Index = if (lambdaIndex > -1) state.rightmostDependent2(lambdaIndex) else -1 + val betaLeftmostDep2Index = if (betaIndex > -1) state.leftmostDependent2(betaIndex) else -1 val lambdaLeftmostDep2 = state.sentence(lambdaLeftmostDep2Index) val lambdaRightmostDep2 = state.sentence(lambdaRightmostDep2Index) val betaLeftmostDep2 = state.sentence(betaLeftmostDep2Index) val lambdaGrandHeadLemma = "l_h2:m:" + lambdaGrandHeadToken.lemmaLower - val lambdaGrandHeadPos = "l_h2:p:" + lambdaGrandHeadToken.posTagString + val lambdaGrandHeadPos = "l_h2:p:" + lambdaGrandHeadToken.posTagString val lambdaLeftmostDep2Lemma = "l_lmd2:m:" + lambdaLeftmostDep2.lemmaLower val lambdaLeftmostDep2Pos = "l_lmd2:p:" + lambdaLeftmostDep2.posTagString @@ -520,11 +390,11 @@ class TransitionBasedParser extends DocumentAnnotator { val betaLeftmostDep2Lemma = "b_lmd2:m:" + betaLeftmostDep2.lemmaLower val betaLeftmostDep2Pos = "b_lmd2:p:" + betaLeftmostDep2.posTagString - val lambdaGrandHeadLabel = "l_h:d:" + (if(lambdaGrandHeadIdx > -1) state.arcLabels(lambdaGrandHeadIdx) else ParserConstants.NULL_STRING) + val lambdaGrandHeadLabel = "l_h:d:" + (if (lambdaGrandHeadIdx > -1) state.arcLabels(lambdaGrandHeadIdx) else ParserConstants.NULL_STRING) - val lambdaLeftmostDep2Label = "l_lmd2:d" + (if(lambdaLeftmostDep2Index > -1 && state.headIndices(lambdaLeftmostDep2Index) > -1) state.arcLabels(lambdaLeftmostDep2Index) else ParserConstants.NULL_STRING) - val lambdaRightmostDep2Label = "l_rmd2:d" + (if(lambdaRightmostDep2Index > -1 && state.headIndices(lambdaRightmostDep2Index) > -1) state.arcLabels(lambdaRightmostDep2Index) else ParserConstants.NULL_STRING) - val betaLeftmostDep2Label = "b_lmd2:d" + (if(betaLeftmostDep2Index > -1 && state.headIndices(betaLeftmostDep2Index) > -1) state.arcLabels(betaLeftmostDep2Index) else ParserConstants.NULL_STRING) + val lambdaLeftmostDep2Label = "l_lmd2:d" + (if (lambdaLeftmostDep2Index > -1 && state.headIndices(lambdaLeftmostDep2Index) > -1) state.arcLabels(lambdaLeftmostDep2Index) else ParserConstants.NULL_STRING) + val lambdaRightmostDep2Label = "l_rmd2:d" + (if (lambdaRightmostDep2Index > -1 && state.headIndices(lambdaRightmostDep2Index) > -1) state.arcLabels(lambdaRightmostDep2Index) else ParserConstants.NULL_STRING) + val betaLeftmostDep2Label = "b_lmd2:d" + (if (betaLeftmostDep2Index > -1 && state.headIndices(betaLeftmostDep2Index) > -1) state.arcLabels(betaLeftmostDep2Index) else ParserConstants.NULL_STRING) addFeature(featureVariable, lambdaGrandHeadLemma) addFeature(featureVariable, lambdaLeftmostDep2Lemma) @@ -563,10 +433,12 @@ class TransitionBasedParser extends DocumentAnnotator { if (file.getParentFile ne null) 
file.getParentFile.mkdirs() serialize(new java.io.FileOutputStream(file)) } + def deserialize(file: File): Unit = { - require(file.exists(), "Trying to load non-existent file: '" +file) + require(file.exists(), "Trying to load non-existent file: '" + file) deserialize(new java.io.FileInputStream(file)) } + def serialize(stream: java.io.OutputStream): Unit = { import cc.factorie.util.CubbieConversions._ // Sparsify the evidence weights @@ -578,8 +450,9 @@ class TransitionBasedParser extends DocumentAnnotator { BinarySerializer.serialize(FeaturesDomain.dimensionDomain, dstream) BinarySerializer.serialize(ParseDecisionDomain, dstream) BinarySerializer.serialize(model, dstream) - dstream.close() // TODO Are we really supposed to close here, or is that the responsibility of the caller? + dstream.close() // TODO Are we really supposed to close here, or is that the responsibility of the caller? } + def deserialize(stream: java.io.InputStream): Unit = { import cc.factorie.util.CubbieConversions._ // Get ready to read sparse evidence weights @@ -589,37 +462,40 @@ class TransitionBasedParser extends DocumentAnnotator { import scala.language.reflectiveCalls model.weights.set(new DenseLayeredTensor2(FeaturesDomain.dimensionDomain.size, ParseDecisionDomain.size, new SparseIndexedTensor1(_))) BinarySerializer.deserialize(model, dstream) - logger.debug("TransitionBasedParser model parameters oneNorm "+model.parameters.oneNorm) - dstream.close() // TODO Are we really supposed to close here, or is that the responsibility of the caller? + logger.debug("TransitionBasedParser model parameters oneNorm " + model.parameters.oneNorm) + dstream.close() // TODO Are we really supposed to close here, or is that the responsibility of the caller? } def setParse(parseTree: ParseTree, heads: Array[Int], labels: Array[String]) = { - for(i <- 1 until heads.length){ + for (i <- 1 until heads.length) { val headIndex = heads(i) - parseTree.setParent(i-1, headIndex-1) - parseTree.label(i-1).set(ParseTreeLabelDomain.index(labels(i)))(null) + parseTree.setParent(i - 1, headIndex - 1) + parseTree.label(i - 1).set(ParseTreeLabelDomain.index(labels(i)))(null) } } - val parseDecisionCache = JavaHashMap[String,ParseDecision]() + val parseDecisionCache = JavaHashMap[String, ParseDecision]() + def getParseDecision(s: String): ParseDecision = parseDecisionCache.getOrElseUpdate(s, new ParseDecision(s)) + def classify(v: ParseDecisionVariable) = { computeFeatures(v.state, v.features, addFeatureString, addConjunctiveFeatureWithNulls) getParseDecision(ParseDecisionDomain.category(model.predict(v.features.value).maxIndex)) } + lazy val model = new LinearMulticlassClassifier(ParseDecisionDomain.size, FeaturesDomain.dimensionSize) - def testString(testSentences:Seq[Sentence], extraText: String = "", numThreads: Int = 1): String = { - val(las, uas, tokSpeed, sentSpeed) = if(numThreads > 1) testPar(testSentences, numThreads) else test(testSentences) + def testString(testSentences: Seq[Sentence], extraText: String = "", numThreads: Int = 1): String = { + val (las, uas, tokSpeed, sentSpeed) = if (numThreads > 1) testPar(testSentences, numThreads) else test(testSentences) s"$extraText LAS=$las UAS=$uas ${tokSpeed} tokens/sec ${sentSpeed} sentences/sec" } - def test(testSentences:Seq[Sentence]): (Double, Double, Double, Double) = { + def test(testSentences: Seq[Sentence]): (Double, Double, Double, Double) = { var i = 0 val numSentences = testSentences.size var t0: Long = 0 var totalTime: Long = 0 - while(i < numSentences){ + while (i < numSentences) 
{ t0 = System.currentTimeMillis() process(testSentences(i)) totalTime += System.currentTimeMillis() - t0 @@ -627,16 +503,16 @@ class TransitionBasedParser extends DocumentAnnotator { } val totalTokens = testSentences.map(_.length).sum val pred = testSentences.map(_.attr[ParseTree]) - (ParserEval.calcLas(pred), ParserEval.calcUas(pred), totalTokens*1000.0/totalTime, numSentences*1000.0/totalTime) + (ParserEval.calcLas(pred), ParserEval.calcUas(pred), totalTokens * 1000.0 / totalTime, numSentences * 1000.0 / totalTime) } - def testPar(testSentences:Seq[Sentence], numThreads: Int): (Double, Double, Double, Double) = { + def testPar(testSentences: Seq[Sentence], numThreads: Int): (Double, Double, Double, Double) = { val t0 = System.currentTimeMillis() - Threading.parForeach(testSentences, numThreads){s => process(s)} + Threading.parForeach(testSentences, numThreads) { s => process(s) } val totalTime = System.currentTimeMillis() - t0 val totalTokens = testSentences.map(_.length).sum val pred = testSentences.map(_.attr[ParseTree]) - (ParserEval.calcLas(pred), ParserEval.calcUas(pred), totalTokens*1000.0/totalTime, testSentences.size*1000.0/totalTime) + (ParserEval.calcLas(pred), ParserEval.calcUas(pred), totalTokens * 1000.0 / totalTime, testSentences.size * 1000.0 / totalTime) } def train(trainSentences: Seq[Sentence], testSentences: Seq[Sentence], lrate: Double = 1.0, delta: Double = 0.1, @@ -646,12 +522,12 @@ class TransitionBasedParser extends DocumentAnnotator { logger.debug(s"Initializing trainer (${nThreads} threads)") - val objective = if(useHingeLoss) OptimizableObjectives.hingeMulticlass else OptimizableObjectives.sparseLogMulticlass + val objective = if (useHingeLoss) OptimizableObjectives.hingeMulticlass else OptimizableObjectives.sparseLogMulticlass def evaluate() { - println(model.weights.value.toSeq.count(x => x == 0).toFloat/model.weights.value.length +" sparsity") + println(model.weights.value.toSeq.count(x => x == 0).toFloat / model.weights.value.length + " sparsity") println(testString(trainSentences, "Train ", nThreads)) - println(testString(testSentences, "Test ", nThreads)) + println(testString(testSentences, "Test ", nThreads)) } FeaturesDomain.dimensionDomain.gatherCounts = true @@ -663,17 +539,17 @@ class TransitionBasedParser extends DocumentAnnotator { FeaturesDomain.dimensionDomain.gatherCounts = false println(s"Feature count after count cutoff=$cutoff: ${FeaturesDomain.dimensionDomain.size}") - if(cutoff > 1) { + if (cutoff > 1) { println("Re-generating decisions after feature count cutoff...") trainingVs = null // gc trainingVs = generateDecisions(trainSentences, ParserConstants.TRAINING, nThreads) } ParseDecisionDomain.freeze() println(s"Label (decision) domain size: ${ParseDecisionDomain.size}") - if(debug) ParseDecisionDomain.dimensionDomain.categories.map(c => ParseDecision(c).readableString).foreach(c => println(c)) + if (debug) ParseDecisionDomain.dimensionDomain.categories.map(c => ParseDecision(c).readableString).foreach(c => println(c)) /* Print out features */ - if(debug) { + if (debug) { println(s"Sentence: ${trainSentences.head.tokens.map(_.string).mkString(" ")}") trainingVs.head.foreach(tv => { println(s"Training decision: ${ParseDecision(ParseDecisionDomain.category(tv.target)).readableString}; features: ${ @@ -685,13 +561,13 @@ class TransitionBasedParser extends DocumentAnnotator { val examples = trainingVs.flatten.map(v => new ParseDecisionExample(v, model, objective)).toSeq println("Training...") - val optimizer = new AdaGradRDA(delta=delta, 
rate=lrate, l1=l1Factor, l2=l2Factor, numExamples=examples.length) - Trainer.onlineTrain(model.parameters, examples, maxIterations=numIterations, optimizer=optimizer, evaluate=evaluate, useParallelTrainer = if(nThreads > 1) true else false, nThreads=nThreads) + val optimizer = new AdaGradRDA(delta = delta, rate = lrate, l1 = l1Factor, l2 = l2Factor, numExamples = examples.length) + Trainer.onlineTrain(model.parameters, examples, maxIterations = numIterations, optimizer = optimizer, evaluate = evaluate, useParallelTrainer = if (nThreads > 1) true else false, nThreads = nThreads) println("Done training") - println(testString(testSentences, "Test ")) + println(testString(testSentences, "Test ")) - val(las, uas, tokPerSec, sentPerSec) = test(testSentences) + val (las, uas, tokPerSec, sentPerSec) = test(testSentences) las } @@ -699,20 +575,26 @@ class TransitionBasedParser extends DocumentAnnotator { // parallelizing this will lead to non-repeatable results // since the order of examples will be different every time val oracle = new NonProjectiveShiftReduce(if (mode == ParserConstants.TRAINING) NonprojectiveGoldOracle.predict else new NonprojectiveBoostingOracle(classify).predict) + def genDecisions(s: Sentence) = oracle.getParseDecisions(new LightweightParseSentence(s)).toArray - (if(nThreads > 1) cc.factorie.util.Threading.parMap(sentences, nThreads)(genDecisions) + + (if (nThreads > 1) cc.factorie.util.Threading.parMap(sentences, nThreads)(genDecisions) else sentences.map(genDecisions)).toArray } // For DocumentAnnotator trait - def process(doc: Document) = { doc.sentences.foreach(process); doc } + def process(doc: Document) = { + doc.sentences.foreach(process); doc + } + def prereqAttrs = Seq(classOf[Sentence], classOf[PosTag], classOf[TokenLemma]) // Sentence also includes Token def postAttrs = Seq(classOf[ParseTree]) - override def tokenAnnotationString(token:Token): String = { + + override def tokenAnnotationString(token: Token): String = { val sentence = token.sentence val pt = if (sentence ne null) sentence.attr[ParseTree] else null if (pt eq null) "_\t_" - else (pt.parentIndex(token.positionInSentence)+1).toString+"\t"+pt.label(token.positionInSentence).categoryValue + else (pt.parentIndex(token.positionInSentence) + 1).toString + "\t" + pt.label(token.positionInSentence).categoryValue } def process(s: Sentence): Sentence = { @@ -722,86 +604,8 @@ class TransitionBasedParser extends DocumentAnnotator { s } - /* Takes features and turns them into a parse decision using predict(ParseDecisionVariable => ParseDecision) */ - class NonProjectiveShiftReduce(val predict: ParseDecisionVariable => ParseDecision) { - import ParserConstants._ - def getParseDecisions(s: LightweightParseSentence): ArrayBuffer[ParseDecisionVariable] = { - val state = new ParseState(0, 1, JavaHashSet[Int](), s) - val decisions = new ArrayBuffer[ParseDecisionVariable] { override val initialSize = 100 } - while(state.input < state.parseSentenceLength) { - if (state.stack < 0) - noShift(state) - else { - val decisionVariable = new ParseDecisionVariable(state) - val label = predict(decisionVariable) - decisions += decisionVariable - transition(state, label) - } - } - decisions - } - - def parse(s: LightweightParseSentence): (Array[Int], Array[String]) = { - val state = new ParseState(0, 1, JavaHashSet[Int](), s) - while(state.input < state.parseSentenceLength) { - if (state.stack < 0) - noShift(state) - else { - val decision = new ParseDecisionVariable(state) - val label = predict(decision) - transition(state, label) - } 
- } - (state.headIndices, state.arcLabels) - } - - private def transition(state: ParseState, label: ParseDecision) = { - if (label.leftOrRightOrNo == LEFT) { - if (state.stack == ROOT_ID) noShift(state) - else if (state.isDescendantOf(state.inputToken(0), state.stackToken(0))) noPass(state) - else if (label.shiftOrReduceOrPass == REDUCE) leftReduce(label.label, state) - else leftPass(label.label, state) - } - else if (label.leftOrRightOrNo == RIGHT) { - if (state.isDescendantOf(state.stackToken(0), state.inputToken(0))) noPass(state) - else if (label.shiftOrReduceOrPass == SHIFT) rightShift(label.label, state) - else rightPass(label.label, state) - } - else { - if (label.shiftOrReduceOrPass == SHIFT) noShift(state) - else if (label.shiftOrReduceOrPass == REDUCE && state.headIndices(state.stackToken(0)) != -1) noReduce(state) - else noPass(state) - } - } - - private def passAux(state: ParseState): Unit = { - var i = state.stack - 1 - while (i >= 0) { - if (!state.reducedIds.contains(i)) { - state.stack = i - return - } - i -= 1 - } - state.stack = i - } - private def leftArc(label: String, state: ParseState) { state.setHead(state.stackToken(0), state.inputToken(0), label) } - private def rightArc(label: String, state: ParseState) { state.setHead(state.inputToken(0), state.stackToken(0), label) } - - private def shift(state: ParseState) { state.stack = state.input; state.input += 1 } - private def reduce(state: ParseState) { state.reducedIds.add(state.stack); passAux(state) } - private def pass(state: ParseState) { passAux(state: ParseState) } - - private def noShift(state: ParseState) { shift(state) } - private def noReduce(state: ParseState) { reduce(state) } - private def noPass(state: ParseState) { pass(state) } - private def leftReduce(label: String, state: ParseState) { leftArc(label, state); reduce(state) } - private def leftPass(label: String, state: ParseState) { leftArc(label, state); pass(state) } - private def rightShift(label: String, state: ParseState) { rightArc(label, state); shift(state) } - private def rightPass(label: String, state: ParseState) { rightArc(label, state); pass(state) } - } trait NonProjectiveOracle { import ParserConstants._ @@ -872,166 +676,12 @@ class TransitionBasedParser extends DocumentAnnotator { basePredict(decisionVariable) } } -} -class WSJTransitionBasedParser(url:java.net.URL) extends TransitionBasedParser(url) -object WSJTransitionBasedParser extends WSJTransitionBasedParser(cc.factorie.util.ClasspathURL[WSJTransitionBasedParser](".factorie")) - -class OntonotesTransitionBasedParser(url:java.net.URL) extends TransitionBasedParser(url) -object OntonotesTransitionBasedParser extends OntonotesTransitionBasedParser(cc.factorie.util.ClasspathURL[OntonotesTransitionBasedParser](".factorie")) - -class TransitionBasedParserArgs extends cc.factorie.util.DefaultCmdOptions with SharedNLPCmdOptions{ - val trainFiles = new CmdOption("train", Nil.asInstanceOf[List[String]], "FILENAME...", "") - val testFiles = new CmdOption("test", Nil.asInstanceOf[List[String]], "FILENAME...", "") - val trainDir = new CmdOption("train-dir", "", "FILENAME", "Directory containing training files.") - val testDir = new CmdOption("test-dir", "", "FILENAME", "Directory containing test files.") - val devDir = new CmdOption("dev-dir", "", "FILENAME", "Directory containing dev files.") - val devFiles = new CmdOption("dev", Nil.asInstanceOf[List[String]], "FILENAME...", "") - val dataLoader = new CmdOption("loader", "LoadOntonotes5", "STRING", "Class name of data loader to use") - val 
cutoff = new CmdOption("cutoff", 2, "", "") - val loadModel = new CmdOption("load", "", "", "") - val nThreads = new CmdOption("num-threads", 1, "INT", "How many threads to use during training.") - val useSVM = new CmdOption("use-svm", false, "BOOL", "Whether to use SVMs to train") - val modelDir = new CmdOption("model", "model", "FILENAME", "File in which to save the trained model.") - val boosting = new CmdOption("bootstrapping", 0, "INT", "The number of bootstrapping iterations to use. 0 means no bootstrapping.") - val saveModel = new CmdOption("save-model", true,"BOOLEAN","whether to write out a model file or not") - val l1 = new CmdOption("l1", 0.01, "FLOAT", "l1 regularization weight") - val l2 = new CmdOption("l2", 0.00001, "FLOAT", "l2 regularization weight") - val rate = new CmdOption("rate", 1.0,"FLOAT", "base learning rate") - val maxIters = new CmdOption("max-iterations", 7, "INT", "Number of passes through data during training") - val delta = new CmdOption("delta", 0.1, "FLOAT", "learning rate delta") - val hingeLoss = new CmdOption("hinge", true, "BOOLEAN", "Whether to use hinge or log loss") - val debug = new CmdOption("debug", false, "BOOLEAN", "Whether to print out debugging info for training (generated features)") } -/* -object TransitionBasedParserTrainer extends cc.factorie.util.HyperparameterMain { - def evaluateParameters(args: Array[String]) = { - val opts = new TransitionBasedParserArgs - implicit val random = new scala.util.Random(0) - opts.parse(args) - - assert(opts.trainFiles.wasInvoked || opts.trainDir.wasInvoked) - - def loadSentences(listOpt: opts.CmdOption[List[String]], dirOpt: opts.CmdOption[String]): Seq[Sentence] = { - var fileList = Seq.empty[String] - if (listOpt.wasInvoked) fileList = listOpt.value.toSeq - if (dirOpt.wasInvoked) fileList ++= FileUtils.getFileListFromDir(dirOpt.value) - fileList.flatMap(fname => opts.dataLoader.value match { - case "LoadWSJMalt" => - load.LoadWSJMalt.fromFilename(fname, loadLemma=load.AutoLabel, loadPos=load.AutoLabel, loadParse=load.GoldLabel, loadNer=false, nerBilou=false).head.sentences.toSeq - case "LoadOntonotes5" => - load.LoadOntonotes5.fromFilename(fname, loadLemma=load.AutoLabel, loadPos=load.AutoLabel, loadParse=load.GoldLabel, loadNer=false, nerBilou=false).head.sentences.toSeq - case "LoadConll2008" => - load.LoadConll2008.fromFilename(fname).head.sentences.toSeq - case l => throw new Error(s"Not configured to load data using $l") - }) - } - val sentencesFull = loadSentences(opts.trainFiles, opts.trainDir) - val devSentencesFull = loadSentences(opts.devFiles, opts.devDir) - val testSentencesFull = loadSentences(opts.testFiles, opts.testDir) - - val trainPortionToTake = if(opts.trainPortion.wasInvoked) opts.trainPortion.value else 1.0 - val testPortionToTake = if(opts.testPortion.wasInvoked) opts.testPortion.value else 1.0 - val sentences = sentencesFull.take((trainPortionToTake*sentencesFull.length).floor.toInt) - val testSentences = testSentencesFull.take((testPortionToTake*testSentencesFull.length).floor.toInt) - val devSentences = devSentencesFull.take((testPortionToTake*devSentencesFull.length).floor.toInt) - - println("Total train sentences: " + sentences.size) - println("Total dev sentences: " + devSentences.size) - println("Total test sentences: " + testSentences.size) - - val parser = new TransitionBasedParser() - val testLAS = parser.train(sentences, devSentences, lrate=opts.rate.value, delta=opts.delta.value, cutoff=opts.cutoff.value, - numBoostingIterations=opts.boosting.value, 
useHingeLoss=opts.hingeLoss.value, useSVM=opts.useSVM.value, - nThreads=opts.nThreads.value, numIterations=opts.maxIters.value, l1Factor=opts.l1.value, - l2Factor=opts.l2.value, debug=opts.debug.value) - - if (opts.saveModel.value) { - val modelUrl: String = if (opts.modelDir.wasInvoked) opts.modelDir.value else opts.modelDir.defaultValue + System.currentTimeMillis().toString + ".factorie" - parser.serialize(new java.io.File(modelUrl)) - val serParser = new TransitionBasedParser - serParser.deserialize(new java.io.File(modelUrl)) - println(serParser.testString(devSentences, "Post serialization test accuracy ")) - } - if(opts.targetAccuracy.wasInvoked) cc.factorie.assertMinimalAccuracy(testLAS,opts.targetAccuracy.value.toDouble) - testLAS - } -} -*/ -/* -object TransitionBasedParserTester { - def main(args: Array[String]) { - val opts = new TransitionBasedParserArgs - opts.parse(args) - assert(opts.testDir.wasInvoked || opts.testFiles.wasInvoked) - - // load model from file if given, - // else if the wsj command line param was specified use wsj model, - // otherwise ontonotes model - val parser = { - if(opts.modelDir.wasInvoked) new TransitionBasedParser(new File(opts.modelDir.value)) - else opts.dataLoader.value match { - case "LoadWSJMalt" => WSJTransitionBasedParser - case "LoadOntonotes5" => OntonotesTransitionBasedParser - } - } - assert(!(opts.testDir.wasInvoked && opts.testFiles.wasInvoked)) - val testFileList = if(opts.testDir.wasInvoked) FileUtils.getFileListFromDir(opts.testDir.value) else opts.testFiles.value.toSeq - - val testPortionToTake = if(opts.testPortion.wasInvoked) opts.testPortion.value else 1.0 - val testSentencesFull = testFileList.flatMap(fname => opts.dataLoader.value match { - case "LoadWSJMalt" => - load.LoadWSJMalt.fromFilename(fname, loadLemma=load.AutoLabel, loadPos=load.AutoLabel, loadParse=load.GoldLabel, loadNer=false, nerBilou=false).head.sentences.toSeq - case "LoadOntonotes5" => - load.LoadOntonotes5.fromFilename(fname, loadLemma=load.AutoLabel, loadPos=load.AutoLabel, loadParse=load.GoldLabel, loadNer=false, nerBilou=false).head.sentences.toSeq - case "LoadConll2008" => - load.LoadConll2008.fromFilename(fname).head.sentences.toSeq - case l => throw new Error(s"Not configured to load data using $l") - }) - val testSentences = testSentencesFull.take((testPortionToTake*testSentencesFull.length).floor.toInt) - - println(parser.testString(testSentences)) - } -} -*/ - -/* -object TransitionBasedParserOptimizer { - def main(args: Array[String]) { - val opts = new TransitionBasedParserArgs - opts.parse(args) - val actuallySaveModel = opts.saveModel.value - opts.saveModel.setValue(false) // don't want to save intermediate models, just the best one - // good for wsj -// val memory = 24 -// val cores = 9 - // good for ontonotes - val memory = 48 - val cores = 20 - opts.nThreads.setValue(cores) // make sure we're using the same amount of cores we're allocating - - val l1 = cc.factorie.util.HyperParameter(opts.l1, new cc.factorie.util.LogUniformDoubleSampler(1e-10, 1e2)) - val l2 = cc.factorie.util.HyperParameter(opts.l2, new cc.factorie.util.LogUniformDoubleSampler(1e-10, 1e2)) - val rate = cc.factorie.util.HyperParameter(opts.rate, new cc.factorie.util.LogUniformDoubleSampler(1e-4, 1e4)) - val delta = cc.factorie.util.HyperParameter(opts.delta, new cc.factorie.util.LogUniformDoubleSampler(1e-4, 1e4)) - // val cutoff = cc.factorie.util.HyperParameter(opts.cutoff, new cc.factorie.util.SampleFromSeq[Int](Seq(1, 2))) - // val bootstrap = 
cc.factorie.util.HyperParameter(opts.bootstrapping, new cc.factorie.util.SampleFromSeq[Int](Seq(0, 1, 2))) - // val maxit = cc.factorie.util.HyperParameter(opts.maxIters, new cc.factorie.util.SampleFromSeq[Int](Seq(5, 7))) - - val qs = new cc.factorie.util.QSubExecutor(memory, "cc.factorie.app.nlp.parse.TransitionBasedParserTrainer", cores) - val optimizer = new cc.factorie.util.HyperParameterSearcher(opts, Seq(l2, rate, delta), qs.execute, 100, 100, 60) - - val result = optimizer.optimize() - println("Got results: " + result.mkString(" ")) - if(actuallySaveModel) { - opts.saveModel.setValue(true) - println("Running best configuration...") - import scala.concurrent.duration._ - Await.result(qs.execute(opts.values.flatMap(_.unParse).toArray), 2.hours) - } - println("Done") - } -} -*/ \ No newline at end of file + + + + + diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/AnyNerPhraseFinder.scala b/src/main/scala/cc/factorie/app/nlp/phrase/AnyNerPhraseFinder.scala new file mode 100644 index 0000000..77714f6 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/AnyNerPhraseFinder.scala @@ -0,0 +1,7 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.app.nlp.coref.NerPhraseFinder +import cc.factorie.app.nlp.ner.NerSpan + +object AnyNerPhraseFinder extends NerPhraseFinder[NerSpan] + diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/ChainChunker.scala b/src/main/scala/cc/factorie/app/nlp/phrase/ChainChunker.scala deleted file mode 100644 index 5d0aab7..0000000 --- a/src/main/scala/cc/factorie/app/nlp/phrase/ChainChunker.scala +++ /dev/null @@ -1,260 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* -package cc.factorie.app.nlp.phrase - -import java.io._ - -import cc.factorie.app.chain.ChainModel -import cc.factorie.app.chain.Observations._ -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.load._ -import cc.factorie.app.nlp.pos.PennPosTag -import cc.factorie.optimize.Trainer -import cc.factorie.util.{BinarySerializer, HyperparameterMain} -import cc.factorie.variable._ - -import scala.io.Source -import scala.reflect.ClassTag -*/ - -/** - * User: cellier - * Date: 10/7/13 - * Time: 2:49 PM - * Chunker based on Sha & Pereira '03 using a linear chain crf. 
- */ - - /* - * Takes as a type parameter an extension from load.Load2000.ChunkTag - * BILOUChunkTag and BIOChunkTag can be trained using conll2000 data - * NestedChunkTag requires custom data tagged in the BILOUNestedChunkDomain notation - * For NP retrieval of the tags generated by this class, app.nlp.mention.NPChunkMentionFinder can be used - */ -/* -class ChainChunker[L<:ChunkTag](chunkDomain: CategoricalDomain[String], newChunkLabel: (Token) => L)(implicit m: ClassTag[L]) extends DocumentAnnotator { - def process(document: Document) = { - document.sentences.foreach(s => { - if (s.nonEmpty) { - s.tokens.foreach(t => if (!t.attr.contains(m.runtimeClass)) t.attr += newChunkLabel(t)) - features(s) - model.maximize(s.tokens.map(_.attr[L]))(null) - } - }) - document - } - def prereqAttrs = Seq(classOf[Token], classOf[Sentence],classOf[PennPosTag]) - def postAttrs = Seq(m.runtimeClass) - def tokenAnnotationString(token: Token) = { val label = token.attr[L]; if (label ne null) label.categoryValue else "(null)" } - - def serialize(stream: OutputStream) { - import cc.factorie.util.CubbieConversions._ - val dstream = new DataOutputStream(stream) - BinarySerializer.serialize(ChunkFeaturesDomain.dimensionDomain, dstream) - BinarySerializer.serialize(model, dstream) - dstream.close() - } - def deserialize(stream: InputStream) { - import cc.factorie.util.CubbieConversions._ - val dstream = new DataInputStream(stream) - BinarySerializer.deserialize(ChunkFeaturesDomain.dimensionDomain, dstream) - BinarySerializer.deserialize(model, dstream) - dstream.close() - } - - def train(trainSentences:Seq[Sentence], testSentences:Seq[Sentence], useFullFeatures:Boolean = false, lrate:Double = 0.1, decay:Double = 0.01, cutoff:Int = 2, doBootstrap:Boolean = true, useHingeLoss:Boolean = false, numIterations: Int = 5, l1Factor:Double = 0.000001, l2Factor:Double = 0.000001)(implicit random: scala.util.Random) { - ChunkFeaturesDomain.setFeatureSet(useFullFeatures) - trainSentences.foreach(s=>features(s)) - print("Features for Training Generated: ") - if(useFullFeatures) println("Full Set") else println("Subset Set") - ChunkFeaturesDomain.freeze() - testSentences.foreach(features) - - def evaluate() { - (trainSentences ++ testSentences).foreach(s => model.maximize(s.tokens.map(_.attr[L]))(null)) - val segmentEvaluation = new cc.factorie.app.chain.SegmentEvaluation[L](chunkDomain.categories.filter(_.length > 2).map(_.substring(2))) - for (sentence <- testSentences) segmentEvaluation += sentence.tokens.map(_.attr[L]) - println(segmentEvaluation) - println("Train accuracy: "+ HammingObjective.accuracy(trainSentences.flatMap(s => s.tokens.map(_.attr[L])))) - println("Test accuracy: "+ HammingObjective.accuracy(testSentences.flatMap(s => s.tokens.map(_.attr[L])))) - } - val examples = trainSentences.map(sentence => new model.ChainStructuredSVMExample(sentence.tokens.map(_.attr[L]))).toSeq - val optimizer = new cc.factorie.optimize.AdaGradRDA(rate=lrate, l1=l1Factor/examples.length, l2=l2Factor/examples.length) - Trainer.onlineTrain(model.parameters, examples, maxIterations=numIterations, optimizer=optimizer, evaluate=evaluate, useParallelTrainer = false) - } - - object ChunkFeaturesDomain extends CategoricalVectorDomain[String]{var fullFeatureSet: Boolean = false; def setFeatureSet(full:Boolean){fullFeatureSet = full}} - - class ChunkFeatures(val token:Token) extends BinaryFeatureVectorVariable[String] { def domain = ChunkFeaturesDomain; override def skipNonCategories = true } - - val model = new ChainModel[ChunkTag, 
ChunkFeatures, Token](chunkDomain, - ChunkFeaturesDomain, - l => l.token.attr[ChunkFeatures], - l => l.token, - t => t.attr[L]){ - useObsMarkov = false - } - - def features(sentence: Sentence): Unit = { - import cc.factorie.app.strings.simplifyDigits - val tokens = sentence.tokens.zipWithIndex - for ((token,i) <- tokens) { - if(token.attr[ChunkFeatures] ne null) - token.attr.remove[ChunkFeatures] - val features = token.attr += new ChunkFeatures(token) - val rawWord = token.string - val posTag = token.attr[PennPosTag] - features += "SENTLOC="+i - features += "P="+posTag - features += "Raw="+rawWord - val shape = cc.factorie.app.strings.stringShape(rawWord, 2) - features += "WS="+shape - if (token.isPunctuation) features += "PUNCTUATION" - if(ChunkFeaturesDomain.fullFeatureSet){ - val word = simplifyDigits(rawWord).toLowerCase - if (word.length > 5) { features += "P="+cc.factorie.app.strings.prefix(word, 4); features += "S="+cc.factorie.app.strings.suffix(word, 4) } - features += "STEM=" + cc.factorie.app.strings.porterStem(word) - features += "WSIZE=" + rawWord.length - - } - features += "BIAS" - } - addNeighboringFeatureConjunctions(sentence.tokens, (t: Token) => t.attr[ChunkFeatures], "W=[^@]*$", List(-2), List(-1), List(1),List(2), List(-1,0), List(0,1)) - addNeighboringFeatureConjunctions(sentence.tokens, (t: Token) => t.attr[ChunkFeatures], "P=[^@]*$", List(-2), List(-1), List(1), List(2), List(-2,-1), List(-1,0), List(0,1), List(1,2),List(-2,-1,0),List(-1,0,1),List(0,1,2)) - } -} - -object BILOUChainChunker extends ChainChunker[BILOUChunkTag](BILOUChunkDomain.dimensionDomain, (t) => new BILOUChunkTag(t,"O")) { - deserialize(new FileInputStream(new java.io.File("BILOUChainChunker.factorie"))) -} - -object BIOChainChunker extends ChainChunker[BIOChunkTag](BIOChunkDomain.dimensionDomain, (t) => new BIOChunkTag(t,"O")) { - deserialize(new FileInputStream(new java.io.File("BIOChainChunker.factorie"))) -} - -object NestedChainChunker extends ChainChunker[BILOUNestedChunkTag](BILOUNestedChunkDomain.dimensionDomain, (t) => new BILOUNestedChunkTag(t,"O:O")) -{ - deserialize(new FileInputStream(new java.io.File("NESTEDChainChunker.factorie"))) -} - - -/* - * By Default: - * Takes conll2000 BIO tagged data as input - * Coverts to and trains on BILOU encoding - */ -object ChainChunkerTrainer extends HyperparameterMain { - def generateErrorOutput(sentence: Sentence): String ={ - val sb = new StringBuffer - sentence.tokens.map{t=>sb.append("%s %20s %10s %10s %s\n".format(if (t.attr.all[ChunkTag].head.valueIsTarget) " " else "*", t.string, t.attr[PennPosTag], t.attr.all[ChunkTag].head.target.categoryValue, t.attr.all[ChunkTag].head.categoryValue))}.mkString("\n") - } - - def evaluateParameters(args: Array[String]): Double = { - implicit val random = new scala.util.Random(0) - val opts = new ChunkerOpts - opts.parse(args) - assert(opts.trainFile.wasInvoked) - val chunk = opts.trainingEncoding.value match { - case "BILOU" => new ChainChunker[BILOUChunkTag](BILOUChunkDomain.dimensionDomain, (t) => new BILOUChunkTag(t,"O")) - case "BIO" => new ChainChunker[BIOChunkTag](BIOChunkDomain.dimensionDomain, (t) => new BIOChunkTag(t,"O")) - //Nested NP Chunker has to be trained from custom training data annotated in the NestedBILOUChunkTag domain style - case "NESTED" => new ChainChunker[BILOUNestedChunkTag](BILOUNestedChunkDomain.dimensionDomain, (t) => new BILOUNestedChunkTag(t,"O:O")) - } - - val trainDocs = LoadConll2000.fromSource(Source.fromFile(opts.trainFile.value),opts.inputEncoding.value) - val 
testDocs = LoadConll2000.fromSource(Source.fromFile(opts.testFile.value),opts.inputEncoding.value) - - println("Read %d training tokens.".format(trainDocs.map(_.tokenCount).sum)) - println("Read %d testing tokens.".format(testDocs.map(_.tokenCount).sum)) - - val trainPortionToTake = if(opts.trainPortion.wasInvoked) opts.trainPortion.value.toDouble else 1.0 - val testPortionToTake = if(opts.testPortion.wasInvoked) opts.testPortion.value.toDouble else 1.0 - val trainSentencesFull = trainDocs.flatMap(_.sentences).filter(!_.isEmpty) - val trainSentences = trainSentencesFull.take((trainPortionToTake*trainSentencesFull.length).floor.toInt) - val testSentencesFull = testDocs.flatMap(_.sentences).filter(!_.isEmpty) - val testSentences = testSentencesFull.take((testPortionToTake*testSentencesFull.length).floor.toInt) - - //If we want to load in BIO training data like conll2000, convert to BILOU encoding so BILOU training can be performed - if(opts.trainingEncoding.value == "BILOU" && opts.inputEncoding.value =="BIO") { - LoadConll2000.convertBIOtoBILOU(testSentences) - LoadConll2000.convertBIOtoBILOU(trainSentences) - }else{ - //Else make sure training encoding and input encoding match - if(opts.trainingEncoding.value != opts.inputEncoding.value) throw new Exception("Specified Training Encoding: " + opts.trainingEncoding.value + " does not match Document Encoding: " + opts.inputEncoding.value) - } - - chunk.train(trainSentences, testSentences, opts.useFullFeatures.value, - opts.rate.value, opts.delta.value, opts.cutoff.value, opts.updateExamples.value, opts.useHingeLoss.value, l1Factor=opts.l1.value, l2Factor=opts.l2.value) - if (opts.saveModel.value) { - chunk.serialize(new FileOutputStream(new File(opts.modelFile.value))) - println("Model Serialized") - } - val acc = HammingObjective.accuracy(testDocs.flatMap(d => d.sentences.flatMap(s => s.tokens.map(_.attr.all[ChunkTag].head)))) - if(opts.targetAccuracy.wasInvoked) assert(acc > opts.targetAccuracy.value.toDouble, "Did not reach accuracy requirement") - if(opts.errorOutput.value) { - val writer = new PrintWriter(new File("ChainChunkingOutput.txt" )) - testSentences.foreach{s=>writer.write(generateErrorOutput(s)); writer.write("")} - writer.close() - } - acc - } -} - - -object ChainChunkerOptimizer { - def main(args: Array[String]) { - val opts = new ChunkerOpts - opts.parse(args) - opts.saveModel.setValue(false) - val l1 = cc.factorie.util.HyperParameter(opts.l1, new cc.factorie.util.LogUniformDoubleSampler(1e-10, 1e2)) - val l2 = cc.factorie.util.HyperParameter(opts.l2, new cc.factorie.util.LogUniformDoubleSampler(1e-10, 1e2)) - val rate = cc.factorie.util.HyperParameter(opts.rate, new cc.factorie.util.LogUniformDoubleSampler(1e-4, 1e4)) - val delta = cc.factorie.util.HyperParameter(opts.delta, new cc.factorie.util.LogUniformDoubleSampler(1e-4, 1e4)) - val cutoff = cc.factorie.util.HyperParameter(opts.cutoff, new cc.factorie.util.SampleFromSeq(List(0,1,2,3))) - val qs = new cc.factorie.util.QSubExecutor(60, "cc.factorie.app.nlp.chunk.ChainChunkingTrainer") - val optimizer = new cc.factorie.util.HyperParameterSearcher(opts, Seq(l1, l2, rate, delta, cutoff), qs.execute, 200, 180, 60) - val result = optimizer.optimize() - println("Got results: " + result.mkString(" ")) - println("Best l1: " + opts.l1.value + " best l2: " + opts.l2.value) - opts.saveModel.setValue(true) - println("Running best configuration...") - import scala.concurrent.Await - import scala.concurrent.duration._ - Await.result(qs.execute(opts.values.flatMap(_.unParse).toArray), 
5.hours) - println("Done") - } -} - -class ChunkerOpts extends cc.factorie.util.DefaultCmdOptions with SharedNLPCmdOptions{ - val conllPath = new CmdOption("rcv1Path", "../../data/conll2000", "DIR", "Path to folder containing RCV1-v2 dataset.") - val outputPath = new CmdOption("ouputPath", "../../data/conll2000/output.txt", "FILE", "Path to write output for evaluation.") - val modelFile = new CmdOption("model", "ChainChunker.factorie", "FILENAME", "Filename for the model (saving a trained model or reading a running model.") - val testFile = new CmdOption("test", "src/main/resources/test.txt", "FILENAME", "test file.") - val trainFile = new CmdOption("train", "src/main/resources/train.txt", "FILENAME", "training file.") - val l1 = new CmdOption("l1", 0.000001,"FLOAT","l1 regularization weight") - val l2 = new CmdOption("l2", 0.00001,"FLOAT","l2 regularization weight") - val rate = new CmdOption("rate", 10.0,"FLOAT","base learning rate") - val delta = new CmdOption("delta", 100.0,"FLOAT","learning rate decay") - val cutoff = new CmdOption("cutoff", 2, "INT", "Discard features less frequent than this before training.") - val updateExamples = new CmdOption("update-examples", true, "BOOL", "Whether to update examples in later iterations during training.") - val useHingeLoss = new CmdOption("use-hinge-loss", false, "BOOL", "Whether to use hinge loss (or log loss) during training.") - val saveModel = new CmdOption("save-model", false, "BOOL", "Whether to save the trained model.") - val runText = new CmdOption("run", "", "FILENAME", "Plain text file on which to run.") - val numIters = new CmdOption("num-iterations","5","INT","number of passes over the data for training") - val inputEncoding = new CmdOption("input-encoding","BIO","String","NESTED, BIO, BILOU - Encoding file used for training is in.") - val trainingEncoding = new CmdOption("train-encoding", "BILOU","String","NESTED, BIO, BILOU - labels to use during training.") - val useFullFeatures = new CmdOption("full-features", false,"BOOL", "True to use the full feature set, False to use a smaller feature set which is the default.") - val errorOutput = new CmdOption("print-output", false,"BOOL", "True to print output to file for error analysis and debugging purposes.") - -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/ConllEntityType.scala b/src/main/scala/cc/factorie/app/nlp/phrase/ConllEntityType.scala new file mode 100644 index 0000000..8ba2e53 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/ConllEntityType.scala @@ -0,0 +1,11 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.app.nlp.ner.ConllNerDomain +import cc.factorie.variable.{CategoricalLabeling, CategoricalVariable} + +/** Categorical variable indicating whether the noun phrase is person, location, organization, etc. + * according to the CoNLL 2003 entity type domain: PER, ORG, LOC, MISC. 
*/ +class ConllEntityType(targetIndex:Int) extends CategoricalVariable[String](targetIndex) with CategoricalLabeling[String] { + def this(targetCategory:String) = this(ConllNerDomain.index(targetCategory)) + def domain = ConllNerDomain +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/ConllPhraseEntityType.scala b/src/main/scala/cc/factorie/app/nlp/phrase/ConllPhraseEntityType.scala new file mode 100644 index 0000000..4fa5444 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/ConllPhraseEntityType.scala @@ -0,0 +1,3 @@ +package cc.factorie.app.nlp.phrase + +class ConllPhraseEntityType(val phrase:Phrase, targetValue:String) extends ConllEntityType(targetValue) \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/ConllPhraseFinder.scala b/src/main/scala/cc/factorie/app/nlp/phrase/ConllPhraseFinder.scala new file mode 100644 index 0000000..46232ef --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/ConllPhraseFinder.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.app.nlp.coref.NerPhraseFinder +import cc.factorie.app.nlp.ner.ConllNerSpan + +object ConllPhraseFinder extends NerPhraseFinder[ConllNerSpan] diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/DatePhrase.scala b/src/main/scala/cc/factorie/app/nlp/phrase/DatePhrase.scala new file mode 100644 index 0000000..ea474bb --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/DatePhrase.scala @@ -0,0 +1,27 @@ +package cc.factorie.app.nlp.phrase + +import java.util.GregorianCalendar + +import cc.factorie.app.nlp.Token + +/** + * Created by andrew@andrewresearch.net on 28/10/17. + */ + +class DatePhrase(startToken: Token, length: Int = 1, val day: Int = -1, val month: Int = -1, val year: Int = Int.MinValue, val weekDay: Int = -1) + extends Phrase(startToken.section, startToken.positionInSection, length, 0) { + + def toJavaDate: java.util.Date = new GregorianCalendar(year, month, day).getTime + + override def toString: String = { + var s = "" + if (weekDay >= 0) s += DatePhraseFinder.nrToWeekDay(weekDay) + ", " + if (day >= 0) s += day + " " + if (month >= 0) s += DatePhraseFinder.nrToMonth(month - 1) + " " + if (year >= 0) s += year + s.trim + } + + def toLocatedDate = LocatedDate(toJavaDate, this.document.name, characterOffsets._1, characterOffsets._2) +} + diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/DatePhraseFinder.scala b/src/main/scala/cc/factorie/app/nlp/phrase/DatePhraseFinder.scala index aa56525..a05a5eb 100644 --- a/src/main/scala/cc/factorie/app/nlp/phrase/DatePhraseFinder.scala +++ b/src/main/scala/cc/factorie/app/nlp/phrase/DatePhraseFinder.scala @@ -12,20 +12,17 @@ limitations under the License. */ package cc.factorie.app.nlp.phrase -import java.util.GregorianCalendar - import cc.factorie._ -import cc.factorie.app.nlp._ import cc.factorie.app.nlp.lemma.TokenLemma import cc.factorie.app.nlp.pos.PennPosTag +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} import scala.collection.mutable.ArrayBuffer +import scala.language.implicitConversions import scala.util.parsing.combinator.{ImplicitConversions, Parsers} import scala.util.parsing.input.{Position, Reader} -import scala.language.implicitConversions -/** A collection of Phrases that are noun phrases. Typically used as an attribute of a Section or a Document. 
*/ -class DatePhraseList(phrases: Iterable[DatePhrase]) extends PhraseList(phrases) + /** * Finds and parses all kinds of dates in a document, Basic formats were taken from http://en.wikipedia.org/wiki/Calendar_date. @@ -230,21 +227,4 @@ class DatePhraseFinder(usePosTag:Boolean) extends DocumentAnnotator with Parsers override def tokenAnnotationString(token: Token): String = token.document.attr[DatePhraseList].find(phrase => phrase.contains(token)).fold("")("Date: " + _.asInstanceOf[DatePhrase].toString()) } -class DatePhrase(startToken: Token, length: Int = 1, val day: Int = -1, val month: Int = -1, val year: Int = Int.MinValue, val weekDay: Int = -1) - extends Phrase(startToken.section, startToken.positionInSection, length, 0) { - - def toJavaDate: java.util.Date = new GregorianCalendar(year, month, day).getTime - - override def toString: String = { - var s = "" - if (weekDay >= 0) s += DatePhraseFinder.nrToWeekDay(weekDay) + ", " - if (day >= 0) s += day + " " - if (month >= 0) s += DatePhraseFinder.nrToMonth(month - 1) + " " - if (year >= 0) s += year - s.trim - } - - def toLocatedDate = LocatedDate(toJavaDate, this.document.name, characterOffsets._1, characterOffsets._2) -} -case class LocatedDate(date:java.util.Date, docId:String, startOffset:Int, endOffset:Int) \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/DatePhraseList.scala b/src/main/scala/cc/factorie/app/nlp/phrase/DatePhraseList.scala new file mode 100644 index 0000000..aac2c78 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/DatePhraseList.scala @@ -0,0 +1,4 @@ +package cc.factorie.app.nlp.phrase + +/** A collection of Phrases that are noun phrases. Typically used as an attribute of a Section or a Document. */ +class DatePhraseList(phrases: Iterable[DatePhrase]) extends PhraseList(phrases) \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/Gender.scala b/src/main/scala/cc/factorie/app/nlp/phrase/Gender.scala new file mode 100644 index 0000000..0cce8f2 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/Gender.scala @@ -0,0 +1,8 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.variable.CategoricalVariable + +class Gender(categoryIndex:Int) extends CategoricalVariable[String](categoryIndex) { + def this(category:String) = this(GenderDomain.index(category)) + final def domain = GenderDomain +} diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/GenderDomain.scala b/src/main/scala/cc/factorie/app/nlp/phrase/GenderDomain.scala new file mode 100644 index 0000000..cbb6e1f --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/GenderDomain.scala @@ -0,0 +1,12 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.variable.EnumDomain + +object GenderDomain extends EnumDomain { + val UNKNOWN, // uncertain + NEUTER, // known to be non-person + PERSON, // person, but uncertain about gender + MALE, // male person + FEMALE = Value // female person + freeze() +} diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/HeadTokenOffset.scala b/src/main/scala/cc/factorie/app/nlp/phrase/HeadTokenOffset.scala new file mode 100644 index 0000000..e5c72ba --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/HeadTokenOffset.scala @@ -0,0 +1,43 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.app.nlp.lexicon.Preposition +import cc.factorie.app.nlp.pos.PennPosTag + +/** A heuristic for selecting the head of a phrase. 
+ * If a parse is available, use it to find the head; if a preposition is found, select the word before it; otherwise simply select the last token. */
+object HeadTokenOffset {
+  def apply(phrase:Phrase): Int = {
+    if (phrase.length == 1) return 0
+    val span = phrase.value
+    val sentence = phrase.sentence
+    // If there is a parse, then traverse up the tree until just before we exit the Span
+    val parse = sentence.parse
+    if (parse ne null) {
+      var headSentenceIndex = math.min(span.end, sentence.end)-1 - sentence.start
+      var parentSentenceIndex = parse.parentIndex(headSentenceIndex)
+      while (span.contains(parentSentenceIndex + sentence.start)) {
+        headSentenceIndex = parentSentenceIndex
+        parentSentenceIndex = parse.parentIndex(parentSentenceIndex)
+      }
+      //Sometimes phrases are broken, consisting of more than one subgraph in the parse tree; check if parent of exit is not again part of mention
+      if(parentSentenceIndex >= 0) {
+        parentSentenceIndex = parse.parentIndex(parentSentenceIndex)
+        while (span.contains(parentSentenceIndex + sentence.start)) {
+          headSentenceIndex = parentSentenceIndex
+          parentSentenceIndex = parse.parentIndex(parentSentenceIndex)
+        }
+      }
+      return headSentenceIndex + sentence.start - span.start
+    } else {
+      // If there is a preposition, select the word just before the first preposition
+      val prepositionIndex = span.indexWhere(Preposition.contains(_))
+      if (prepositionIndex >= 1) return prepositionIndex - 1
+      // If there is a noun, return the last noun
+      val lastNounIndex = span.lastIndexWhere(_.attr[PennPosTag].isNoun)
+      if (lastNounIndex > 0) return lastNounIndex
+      // Otherwise simply select the last word of the span
+      else return span.length-1
+
+    }
+  }
+}
diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/LocatedDate.scala b/src/main/scala/cc/factorie/app/nlp/phrase/LocatedDate.scala
new file mode 100644
index 0000000..eedefea
--- /dev/null
+++ b/src/main/scala/cc/factorie/app/nlp/phrase/LocatedDate.scala
@@ -0,0 +1,3 @@
+package cc.factorie.app.nlp.phrase
+
+case class LocatedDate(date:java.util.Date, docId:String, startOffset:Int, endOffset:Int)
diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/MentionPhraseNumberLabeler.scala b/src/main/scala/cc/factorie/app/nlp/phrase/MentionPhraseNumberLabeler.scala
new file mode 100644
index 0000000..76cbd5d
--- /dev/null
+++ b/src/main/scala/cc/factorie/app/nlp/phrase/MentionPhraseNumberLabeler.scala
@@ -0,0 +1,6 @@
+package cc.factorie.app.nlp.phrase
+
+import cc.factorie.app.nlp.coref.WithinDocCoref
+
+//class MentionPhraseNumberLabeler extends PhraseNumberLabeler[WithinDocCoref](_.mentions.map(_.phrase))
+object MentionPhraseNumberLabeler extends NounPhraseNumberLabeler[WithinDocCoref](_.mentions.map(_.phrase))
diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NPChunkMentionFinder.scala b/src/main/scala/cc/factorie/app/nlp/phrase/NPChunkMentionFinder.scala
deleted file mode 100644
index e578540..0000000
--- a/src/main/scala/cc/factorie/app/nlp/phrase/NPChunkMentionFinder.scala
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
-   This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
-   http://factorie.cs.umass.edu, http://github.com/factorie
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - - -package cc.factorie.app.nlp.phrase - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.load.{BILOUChunkTag, BILOUNestedChunkTag, ChunkTag} - -import scala.collection.mutable.ListBuffer -import scala.reflect.ClassTag - -/** User: cellier - * Date: 10/28/13 - * Time: 11:24 PM - */ - -/** Object to retrieve two layers of Nested BILOU Tags*/ -object NestedNPChunkPhraseFinder extends NPChunkPhraseFinder[BILOUNestedChunkTag]{ - //Splits tag value and calls to retrieve NPs for the inner tags and outer tags - override def getMentionSpans(document: Document): Seq[TokenSpan] ={ - val mentionSpans = ListBuffer[TokenSpan]() - document.sentences.foreach{s=> - val chunkTags = s.tokens.map(t => t.attr[BILOUNestedChunkTag].categoryValue.split(":").map(layer => t -> layer)).map(layer => (layer(0),layer(1))) - val (innerTags,outerTags) = chunkTags.unzip - //splitting up the tags into the inner and outer tags and grabbing noun span separately seemed like the safest option - //but might not be the fastest - mentionSpans ++= getNPChunkSpans(s,innerTags) - mentionSpans ++= getNPChunkSpans(s,outerTags) - } - mentionSpans.seq - } -} -//Default for MentionFinder is BILOU Notation over BIO since BILOU performed best for NP mention finding -object NPChunkMentionFinder extends NPChunkPhraseFinder[BILOUChunkTag] - -class NPChunkPhraseFinder[L<:ChunkTag](implicit m: ClassTag[L]) extends DocumentAnnotator { - def prereqAttrs = Seq(classOf[Token], classOf[Sentence], m.runtimeClass) - def postAttrs = Seq(classOf[NounPhraseList]) - override def tokenAnnotationString(token:Token): String = token.document.attr[PhraseList].filter(phrase => phrase.contains(token)) match { case phraseSeq:Seq[Phrase] if phraseSeq.length > 0 => phraseSeq.map(phrase => phrase.attr[NounPhraseType].categoryValue+":"+ phrase.attr[OntonotesPhraseEntityType].categoryValue +":" +phrase.indexOf(token)).mkString(","); case _ => "_" } - - val upperCase = "[A-Z]+".r - - def process(document: Document) = { - val phrases = getChunkPhrases(document) - document.attr += new NounPhraseList(phrases.sortBy(phrase => (phrase.head.stringStart, phrase.length))) - document - } - - def getChunkPhrases(document: Document): Seq[Phrase] = { - getMentionSpans(document).map(span => new Phrase(span))//Get the head from the phrase's heuristic labeler - } - - def getMentionSpans(document: Document): Seq[TokenSpan] ={ - val mentionSpans = ListBuffer[TokenSpan]() - document.sentences.foreach{s=> - val chunkTags = s.tokens.map(t => t-> t.attr[BILOUChunkTag].categoryValue) - mentionSpans ++= getNPChunkSpans(s,chunkTags) - } - mentionSpans.seq - } - - def getNPChunkSpans(s: Sentence,chunkTags: IndexedSeq[(Token, String)]):Seq[TokenSpan]={ - val spans = ListBuffer[TokenSpan]() - chunkTags.map{case (t,chunk) => - if (chunk != "O") { - if(chunk == "U-NP") spans += new TokenSpan(s.section, t.positionInSection, 1) - else if(chunk == "B-NP"){ - if(t.hasNext) { - var lookFor = t.next - while (lookFor.hasNext && lookFor.sentence == t.sentence && chunkTags(chunkTags.map(_._1.string).indexOf(lookFor.string))._2.matches("(I|L)-NP")) lookFor = lookFor.next - spans += new 
TokenSpan(s.section, t.positionInSection, lookFor.positionInSection - t.positionInSection)
-          } else spans += new TokenSpan(s.section, t.positionInSection, 1)
-        }
-      }
-    }
-    spans.toSeq
-  }
-}
-
-
-
diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NnpPosNounPhraseFinder.scala b/src/main/scala/cc/factorie/app/nlp/phrase/NnpPosNounPhraseFinder.scala
new file mode 100644
index 0000000..6de2187
--- /dev/null
+++ b/src/main/scala/cc/factorie/app/nlp/phrase/NnpPosNounPhraseFinder.scala
@@ -0,0 +1,34 @@
+package cc.factorie.app.nlp.phrase
+
+import cc.factorie.app.nlp.Document
+import cc.factorie.app.nlp.coref.MentionPhraseFinder
+import cc.factorie.app.nlp.pos.{PennPosDomain, PennPosTag}
+
+import scala.collection.mutable
+
+/** Apply returns a list of NNP-indicated proper noun phrases, given PennPosTags.
+  *
+  * @author Andrew McCallum */
+object NnpPosNounPhraseFinder extends MentionPhraseFinder {
+  def prereqAttrs = Seq(classOf[PennPosTag])
+  def apply(doc:Document): Seq[Phrase] = {
+    val result = new mutable.ArrayBuffer[Phrase]
+    var start = 0
+    for (section <- doc.sections) {
+      val tokens = section.tokens
+      while (start < tokens.length) {
+        val token = tokens(start)
+        var end = start
+        while (end < tokens.length && tokens(end).posTag.intValue == PennPosDomain.nnpIndex) end += 1
+        if (end != start && tokens(end-1).posTag.intValue == PennPosDomain.nnpIndex) {
+          val phrase = new Phrase(section, token.positionInSection, length=end-start,offsetToHeadToken = -1)
+          phrase.attr += new NounPhraseType(phrase, "NAM")
+          NounPhraseEntityTypeLabeler.process(phrase)
+          result += phrase
+        }
+        start = math.max(start+1, end)
+      }
+    }
+    result
+  }
+}
\ No newline at end of file
diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseEntityTypeLabeler.scala b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseEntityTypeLabeler.scala
new file mode 100644
index 0000000..c884cb2
--- /dev/null
+++ b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseEntityTypeLabeler.scala
@@ -0,0 +1,5 @@
+package cc.factorie.app.nlp.phrase
+
+import cc.factorie.util.ClasspathURL
+
+object NounPhraseEntityTypeLabeler extends OntonotesPhraseEntityTypeLabeler(ClasspathURL[OntonotesPhraseEntityTypeLabeler](".factorie").openConnection().getInputStream)
diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseGender.scala b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseGender.scala
index 8cf4ba2..df1e9cb 100644
--- a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseGender.scala
+++ b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseGender.scala
@@ -12,184 +12,8 @@ limitations under the License.
*/ package cc.factorie.app.nlp.phrase -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} -import cc.factorie.app.nlp.coref.{Mention, PronounSets} -import cc.factorie.variable.{CategoricalVariable, EnumDomain} +import cc.factorie.app.nlp.coref.Mention -import scala.reflect.ClassTag - -object GenderDomain extends EnumDomain { - val UNKNOWN, // uncertain - NEUTER, // known to be non-person - PERSON, // person, but uncertain about gender - MALE, // male person - FEMALE = Value // female person - freeze() -} - -class Gender(categoryIndex:Int) extends CategoricalVariable[String](categoryIndex) { - def this(category:String) = this(GenderDomain.index(category)) - final def domain = GenderDomain -} -class PhraseGender(val phrase:Phrase, categoryIndex:Int) extends Gender(categoryIndex) { - def this(phrase:Phrase, category:String) = this(phrase, GenderDomain.index(category)) -} - - -/** Cheap gender predictor based on rules and lexicons. */ -class PhraseGenderLabeler[A<:AnyRef](documentAttrToPhrases:(A)=>Iterable[Phrase])(implicit docAttrClass:ClassTag[A]) extends DocumentAnnotator { - - // todo fix this - @deprecated("This exists to preserve prior behavior, it should be a constructor argument", "10/5/15") - val lexicon = new StaticLexicons()(LexiconsProvider.classpath()) - - def process(document:Document): Document = { - for (phrase <- documentAttrToPhrases(document.attr[A])) process(phrase) - document - } - def process(phrase:Phrase): Unit = { - import GenderDomain._ - val gender = new PhraseGender(phrase, UNKNOWN) - phrase.attr += gender - if (phrase.length > 0) { - val genderFromLexicon = lexiconGender(phrase) - if (genderFromLexicon.isDefined) gender := genderFromLexicon.get - else { - val firstWord = phrase(0).string.toLowerCase - val lastWord = phrase.last.string.toLowerCase - var firstName = firstWord - if (lexicon.iesl.PersonHonorific.containsWord(firstWord)) { - gender := PERSON - if (maleHonors.contains(firstWord)) gender := MALE - else if (femaleHonors.contains(firstWord)) gender := FEMALE - if (phrase.length >= 3) firstName = phrase(1).string.toLowerCase - } - if (gender.intValue != MALE && gender.intValue != FEMALE) { - if (lexicon.iesl.Month.containsWord(firstWord)) gender := NEUTER - else if (lexicon.uscensus.PersonFirstMale.containsWord(firstName)) gender := MALE - else if (lexicon.uscensus.PersonFirstFemale.containsWord(firstName) && firstName != "an") gender := FEMALE - else if (gender.intValue == GenderDomain.UNKNOWN && lexicon.iesl.PersonLast.containsWord(lastWord)) gender := PERSON - if (lexicon.iesl.City.contains(phrase) || lexicon.iesl.Country.contains(phrase) || lexicon.iesl.OrgSuffix.containsWord(lastWord)) - if (gender.intValue == UNKNOWN) gender := NEUTER else gender := UNKNOWN // Could be either person or other; mark it unknown - } - } - } - } - - /** Test various words in the phrase to see if they indicate gender. Return an index into the NounPhraseGenderDomain. 
*/ - def lexiconGender(phrase: Phrase): Option[Int] = { - if (phrase.length == 1) lexiconGender(phrase.tokens(0).string) - else if (phrase.length == 2) lexiconGender(phrase.tokens(0).string).orElse(lexiconGender(phrase.tokens(1).string)) - else lexiconGender(phrase.headToken.string).orElse(lexiconGender(phrase.tokens(0).string).orElse(lexiconGender(phrase.tokens(1).string))) - } - def lexiconGender(word:String): Option[Int] = { - val lemma = word.toLowerCase - if (maleWords.contains(lemma)) Some(GenderDomain.MALE) - else if (femaleWords.contains(lemma)) Some(GenderDomain.FEMALE) - else if (PronounSets.neuter.contains(lemma)) Some(GenderDomain.NEUTER) - else if (PronounSets.allPersonPronouns.contains(lemma)) Some(GenderDomain.PERSON) - else None -} - - //since lemmaString is singular, we don't need to hard code in the plural form of these words - val maleHonors = Set("mr.", "mr", "mister") - val femaleHonors = Set("ms.", "ms", "mrs.", "mrs", "miss", "misses") - - val maleFemaleWords = Seq( - ("", "actress"), - ("", "adulteress"), - ("", "giantess"), - ("", "heiress"), - ("", "hostess"), - ("", "poetess"), - ("", "shepherdess"), - ("baron", "baroness"), - ("boar", "sow"), - ("boy", "girl"), - ("boy-friend", "girl-friend"), - ("boyfriend", "girlfriend"), - ("bridegroom", "bride"), - ("bro", "sis"), - ("brother", "sister"), - ("brother-in-law", "sister-in-law"), - ("buck", "roe"), - ("bull", "cow"), - ("chap", ""), - ("cock", "hen"), - ("codger", ""), - ("count", "countess"), - ("dad", "mom"), - ("dad", "mum"), - ("daddy", "mommy"), - ("deacon", "deaconess"), - ("dude", "dame"), - ("duke", "duchess"), - ("emperor", "empress"), - ("father", "mother"), - ("father-in-law", "mother-in-law"), - ("fiance", "fiancee"), - ("fianc\u00E9", "fianc\u00E9e"), - ("gigolo", "prostitute"), - ("godfather", "godmother"), - ("godson", "goddaughter"), - ("grandfather", "grandmother"), - ("grandpa", "grandma"), - ("grandson", "granddaughter"), - ("guy", "gal"), - ("he", "she"), - ("hero", "heroine"), - ("him", "her"), - ("his", "hers"), - ("husband", "wife"), - ("king", "queen"), - ("lad", "lass"), - ("landlord", "landlady"), - ("lion", "lioness"), - ("lord", "lady"), - ("male", "female"), - ("man", "woman"), - ("manservant", "maidservant"), - ("master", "mistress"), - ("men", "women"), - ("monk", "nun"), - ("nephew", "niece"), - ("pa", "ma"), - ("papa", "mama"), - ("papa", "mamma"), - ("papa", "momma"), - ("peacock", "peahen"), - ("pop", "mom"), - ("pope", ""), - ("priest", "priestess"), - ("prince", "princess"), - ("ram", "ewe"), - ("sir", "madam"), - ("sir", "ma'am"), - ("son-in-law", "daughter-in-law"), - ("stallion", "mare"), - ("step-father", "step-mother"), - ("step-son", "step-daughter"), - ("steward", "stewardess"), - ("tiger", "tigress"), - ("tom", "tib"), // cat or elephant - ("uncle", "aunt"), - ("waiter", "waitress"), - ("widower", "widow") - ) - val maleWords = maleFemaleWords.map(_._1).filter(_.length > 0).toSet - val femaleWords = maleFemaleWords.map(_._2).filter(_.length > 0).toSet - - override def tokenAnnotationString(token:Token): String = { val phrases = documentAttrToPhrases(token.document.attr[A]).filter(_.contains(token)); phrases.map(_.attr[Gender].categoryValue).mkString(",") } - override def phraseAnnotationString(phrase:Phrase): String = { val t = phrase.attr[Gender]; if (t ne null) t.categoryValue else "_" } - def prereqAttrs: Iterable[Class[_]] = List(docAttrClass.runtimeClass)//Require some TokenSpanList containing subclass of Phrase elements - // Note that this postAttr doesn't 
indicate if all Phrases or just some Mention Phrases were actually labeled. - def postAttrs: Iterable[Class[_]] = List(classOf[PhraseGender]) -} - -/** Gender label all phrases in the Document's NounPhraseList. */ -class NounPhraseGenderLabeler extends PhraseGenderLabeler[NounPhraseList](phrase=>phrase) -object NounPhraseGenderLabeler extends NounPhraseGenderLabeler /** Gender label phrases of all Mentions in the Document's MentionList. */ class MentionPhraseGenderLabeler extends PhraseGenderLabeler[Seq[Mention]](mentions =>mentions.map(_.phrase)) diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseGenderLabeler.scala b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseGenderLabeler.scala new file mode 100644 index 0000000..dce1ad9 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseGenderLabeler.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.phrase + +/** Gender label all phrases in the Document's NounPhraseList. */ +class NounPhraseGenderLabeler extends PhraseGenderLabeler[NounPhraseList](phrase=>phrase) +object NounPhraseGenderLabeler extends NounPhraseGenderLabeler diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseList.scala b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseList.scala new file mode 100644 index 0000000..1819f66 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseList.scala @@ -0,0 +1,4 @@ +package cc.factorie.app.nlp.phrase + +/** A collection of Phrases that are noun phrases. Typically used as an attribute of a Section or a Document. */ +class NounPhraseList(phrases:Iterable[Phrase]) extends PhraseList(phrases) diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseNumber.scala b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseNumberLabeler.scala similarity index 58% rename from src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseNumber.scala rename to src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseNumberLabeler.scala index 12c743d..699da1c 100644 --- a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseNumber.scala +++ b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseNumberLabeler.scala @@ -1,42 +1,12 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
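// --- Illustrative usage sketch (not part of the patch) ---
// The concrete gender labelers are PhraseGenderLabeler instantiated with the document
// attribute that holds the phrases plus a function extracting them (a NounPhraseList for
// NounPhraseGenderLabeler, a Mention sequence for MentionPhraseGenderLabeler, both shown
// above). A minimal run over a document whose NounPhraseList has already been populated
// might look like this; the value names are hypothetical.
import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.phrase.{NounPhraseGenderLabeler, NounPhraseList}

object GenderLabelingSketch {
  def printGenders(doc: Document): Unit = {
    NounPhraseGenderLabeler.process(doc)   // attaches a PhraseGender attr to every phrase
    for (phrase <- doc.attr[NounPhraseList])
      println(phrase.tokens.map(_.string).mkString(" ") + " -> " + phrase.gender.categoryValue)
  }
}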
*/ package cc.factorie.app.nlp.phrase -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.coref.WithinDocCoref import cc.factorie.app.nlp.morph.BasicMorphologicalAnalyzer -import cc.factorie.app.nlp.pos._ -import cc.factorie.variable.{CategoricalVariable, EnumDomain} +import cc.factorie.app.nlp.pos.PennPosTag +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} import scala.reflect.ClassTag -object NumberDomain extends EnumDomain { - val UNKNOWN, // uncertain - SINGULAR, // one of something - PLURAL = Value // multiple of something - freeze() -} - -class Number extends CategoricalVariable[String] { - def this(value:String) = { this(); _initialize(domain.index(value)) } - def this(value:Int) = { this(); _initialize(value) } - def domain = NumberDomain -} -class PhraseNumber(val phrase:Phrase, value:Int) extends Number(value) { - def this(phrase:Phrase, value:String) = this(phrase, NumberDomain.index(value)) -} - -/** Cheap number predictor based on rules and lexicons. Really this should use a real morphological analyzer. */ +/** Cheap number predictor based on rules and lexicon. Really this should use a real morphological analyzer. */ class NounPhraseNumberLabeler[A<:AnyRef](documentAttrToPhrases:(A)=>Iterable[Phrase])(implicit docAttrClass:ClassTag[A]) extends DocumentAnnotator { val singularPronoun = Set("i", "me", "my", "mine", "myself", "he", "she", "it", "him", "her", "his", "hers", "its", "one", "ones", "oneself", "this", "that") val pluralPronoun = Set("we", "us", "our", "ours", "ourselves", "ourself", "they", "them", "their", "theirs", "themselves", "themself", "these", "those") @@ -75,9 +45,3 @@ class NounPhraseNumberLabeler[A<:AnyRef](documentAttrToPhrases:(A)=>Iterable[Phr //class NounPhraseNumberLabeler extends PhraseNumberLabeler[NounPhraseList](phrases => phrases) object NounPhraseNumberLabeler extends NounPhraseNumberLabeler[NounPhraseList](phrases => phrases) - -//class MentionPhraseNumberLabeler extends PhraseNumberLabeler[WithinDocCoref](_.mentions.map(_.phrase)) -object MentionPhraseNumberLabeler extends NounPhraseNumberLabeler[WithinDocCoref](_.mentions.map(_.phrase)) - -// No reason to have this. Label should always go on Phrase, not mention. -akm -//object MentionNumberLabeler extends NumberLabeler[Mention,MentionList] diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseType.scala b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseType.scala index 7af9d79..14bf1cd 100644 --- a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseType.scala +++ b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseType.scala @@ -1,52 +1,12 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
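// --- Illustrative usage sketch (not part of the patch) ---
// NounPhraseNumberLabeler follows the same pattern as the gender labeler: given a document
// whose NounPhraseList is already populated (and whose tokens carry PennPosTag), it attaches
// a PhraseNumber drawn from NumberDomain (UNKNOWN, SINGULAR, PLURAL) to each phrase.
// The value names below are hypothetical.
import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.phrase.{NounPhraseList, NounPhraseNumberLabeler}

object NumberLabelingSketch {
  def printNumbers(doc: Document): Unit = {
    NounPhraseNumberLabeler.process(doc)
    for (phrase <- doc.attr[NounPhraseList])
      println(phrase.tokens.map(_.string).mkString(" ") + " -> " + phrase.number.categoryValue)
  }
}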
*/ - package cc.factorie.app.nlp.phrase -import cc.factorie.app.nlp.Token -import cc.factorie.variable.{CategoricalDomain, CategoricalVariable} + +import cc.factorie.variable.CategoricalVariable + /** Categorical variable indicating whether the noun phrase is a pronoun, common noun phrase or proper noun phrase. - (In earlier versions this was called "MentionType", but it really is an attribute of the Phrase.) - @author Andrew McCallum */ + * (In earlier versions this was called "MentionType", but it really is an attribute of the Phrase.) + * + * @author Andrew McCallum */ class NounPhraseType(val phrase:Phrase, targetValue:String) extends CategoricalVariable(targetValue) { def domain = NounPhraseTypeDomain } - -/** Categorical domain indicating whether the noun phrase is a pronoun, common noun phrase or proper noun phrase. - @author Andrew McCallum */ -object NounPhraseTypeDomain extends CategoricalDomain(List("PRO", "NOM", "NAM")) // TODO consider renaming these to "PRONOUN", "COMMON", "PROPER". -akm - - -/** A weak rule-based predictor of noun phrase type. */ -object DeterministicNounPhraseTypeLabeler { - private final val PERSONAL_PRONOUNS = Seq("PRP", "PRP$") - private final val COMMON_NOUNS = Seq("NN" , "NNS") - private final val PROPER_NOUNS = Seq("NNP", "NNPS") - private final val ALL_NOUNS = Seq("NN","NNS","NNP","NNPS","PRP","PRP$") - - private def isPersonalPronoun(t: Token) = PERSONAL_PRONOUNS.contains(t.posTag.categoryValue.toUpperCase) - private def isCommonNoun (t: Token) = COMMON_NOUNS.contains(t.posTag.categoryValue.toUpperCase) - private def isProperNoun (t: Token) = PROPER_NOUNS.contains(t.posTag.categoryValue.toUpperCase) - private def isNoun (t: Token) = ALL_NOUNS.contains(t.posTag.categoryValue.toUpperCase) - - def process(phrase:Phrase): Unit = { - if (phrase.attr[NounPhraseType] ne null) return - - val nounType = - if(isPersonalPronoun(phrase.headToken)) "PRO" - else if(isCommonNoun(phrase.headToken)) "NOM" - else if(isProperNoun(phrase.headToken)) "NAM" - else "NOM" - phrase.attr += new NounPhraseType(phrase,nounType) - } -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseTypeDomain.scala b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseTypeDomain.scala new file mode 100644 index 0000000..fed8c61 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseTypeDomain.scala @@ -0,0 +1,8 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.variable.CategoricalDomain + +/** Categorical domain indicating whether the noun phrase is a pronoun, common noun phrase or proper noun phrase. + * + * @author Andrew McCallum */ +object NounPhraseTypeDomain extends CategoricalDomain(List("PRO", "NOM", "NAM")) // TODO consider renaming these to "PRONOUN", "COMMON", "PROPER". 
-akm diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/Number.scala b/src/main/scala/cc/factorie/app/nlp/phrase/Number.scala new file mode 100644 index 0000000..49a4ca6 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/Number.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.variable.CategoricalVariable + +class Number extends CategoricalVariable[String] { + def this(value:String) = { this(); _initialize(domain.index(value)) } + def this(value:Int) = { this(); _initialize(value) } + def domain = NumberDomain +} diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NumberDomain.scala b/src/main/scala/cc/factorie/app/nlp/phrase/NumberDomain.scala new file mode 100644 index 0000000..b499785 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/NumberDomain.scala @@ -0,0 +1,10 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.variable.EnumDomain + +object NumberDomain extends EnumDomain { + val UNKNOWN, // uncertain + SINGULAR, // one of something + PLURAL = Value // multiple of something + freeze() +} diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesEntityType.scala b/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesEntityType.scala new file mode 100644 index 0000000..be96b05 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesEntityType.scala @@ -0,0 +1,11 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.app.nlp.ner.OntonotesEntityTypeDomain +import cc.factorie.variable.LabeledCategoricalVariable + +/** Categorical variable indicating whether the noun phrase is person, location, organization, etc. + * according to the Ontonotes entity type domain. */ +class OntonotesEntityType(targetValue:String, val exactMatch:Boolean = false) extends LabeledCategoricalVariable(targetValue) { + def domain = OntonotesEntityTypeDomain + +} diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseEntityType.scala b/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseEntityType.scala new file mode 100644 index 0000000..da4fd5c --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseEntityType.scala @@ -0,0 +1,3 @@ +package cc.factorie.app.nlp.phrase + +class OntonotesPhraseEntityType(val phrase:Phrase, targetValue:String, exactMatch:Boolean = false) extends OntonotesEntityType(targetValue,exactMatch) diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseEntityType.scala b/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseEntityTypeLabeler.scala similarity index 61% rename from src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseEntityType.scala rename to src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseEntityTypeLabeler.scala index 5742fea..a27dc66 100644 --- a/src/main/scala/cc/factorie/app/nlp/phrase/NounPhraseEntityType.scala +++ b/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseEntityTypeLabeler.scala @@ -1,48 +1,15 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
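// --- Illustrative sketch (not part of the patch) ---
// OntonotesPhraseEntityType above ties an Ontonotes entity-type label to a Phrase via its
// attr map. A minimal example of setting and reading it; "PERSON" is assumed to be one of
// OntonotesEntityTypeDomain's categories, and the helper names are hypothetical.
import cc.factorie.app.nlp.phrase.{OntonotesPhraseEntityType, Phrase}

object EntityTypeAttrSketch {
  def markAsPerson(phrase: Phrase): Unit =
    phrase.attr += new OntonotesPhraseEntityType(phrase, "PERSON")

  def entityTypeOf(phrase: Phrase): String =
    phrase.attr[OntonotesPhraseEntityType].categoryValue
}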
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ package cc.factorie.app.nlp.phrase -import cc.factorie.app.nlp.lexicon._ import java.io._ -import cc.factorie._ import cc.factorie.app.classify.backend.LinearMulticlassClassifier -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.load.LoadConll2011 -import cc.factorie.app.nlp.ner.{ConllNerDomain, OntonotesEntityTypeDomain} +import cc.factorie.app.nlp.lexicon._ +import cc.factorie.app.nlp.ner.OntonotesEntityTypeDomain +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token, wordnet} +import cc.factorie.la import cc.factorie.optimize.{OptimizableObjectives, PredictorExample, Trainer} -import cc.factorie.util.{BinarySerializer, ClasspathURL} -import cc.factorie.variable.{BinaryFeatureVectorVariable, CategoricalLabeling, CategoricalVariable, CategoricalVectorDomain, LabeledCategoricalVariable} - - - -/** Categorical variable indicating whether the noun phrase is person, location, organization, etc. - according to the CoNLL 2003 entity type domain: PER, ORG, LOC, MISC. */ -class ConllEntityType(targetIndex:Int) extends CategoricalVariable[String](targetIndex) with CategoricalLabeling[String] { - def this(targetCategory:String) = this(ConllNerDomain.index(targetCategory)) - def domain = ConllNerDomain -} - -class ConllPhraseEntityType(val phrase:Phrase, targetValue:String) extends ConllEntityType(targetValue) - -/** Categorical variable indicating whether the noun phrase is person, location, organization, etc. - according to the Ontonotes entity type domain. */ -class OntonotesEntityType(targetValue:String, val exactMatch:Boolean = false) extends LabeledCategoricalVariable(targetValue) { - def domain = OntonotesEntityTypeDomain - -} - -class OntonotesPhraseEntityType(val phrase:Phrase, targetValue:String, exactMatch:Boolean = false) extends OntonotesEntityType(targetValue,exactMatch) +import cc.factorie.util.BinarySerializer +import cc.factorie.variable.{BinaryFeatureVectorVariable, CategoricalVectorDomain} class OntonotesPhraseEntityTypeLabeler extends DocumentAnnotator { def this(stream:InputStream) = { this(); deserialize(stream) } @@ -68,13 +35,13 @@ class OntonotesPhraseEntityTypeLabeler extends DocumentAnnotator { def domain = FeatureDomain } lazy val model = new LinearMulticlassClassifier(OntonotesEntityTypeDomain.size, FeatureDomain.dimensionDomain.size) - + def features(mention:Phrase): FeatureVariable = { val features = new FeatureVariable var tokens = mention.tokens.toSeq if (tokens.head.string == "the") tokens = tokens.drop(1) if (tokens.length > 0 && tokens.last.string == "'s") tokens = tokens.dropRight(1) - if (tokens.length == 0) return features // TODO Complain further here? + if (tokens.length == 0) return features // TODO Complain further here? 
val words = tokens.map(token => cc.factorie.app.strings.collapseDigits(token.string)) features ++= words features += "HEAD="+mention.headToken.string @@ -90,22 +57,22 @@ class OntonotesPhraseEntityTypeLabeler extends DocumentAnnotator { features } val lexicons = Seq( - lexicon.iesl.PersonFirst, - lexicon.iesl.PersonLast, - lexicon.iesl.Month, - lexicon.iesl.PersonHonorific, - lexicon.iesl.Company, - lexicon.iesl.Country, - lexicon.iesl.City, - lexicon.iesl.AllPlaces, - lexicon.iesl.UsState, - lexicon.wikipedia.Person, - lexicon.wikipedia.Event, - lexicon.wikipedia.Location, - lexicon.wikipedia.Organization, - lexicon.wikipedia.ManMadeThing, - lexicon.wikipedia.Event) - + lexicon.iesl.PersonFirst, + lexicon.iesl.PersonLast, + lexicon.iesl.Month, + lexicon.iesl.PersonHonorific, + lexicon.iesl.Company, + lexicon.iesl.Country, + lexicon.iesl.City, + lexicon.iesl.AllPlaces, + lexicon.iesl.UsState, + lexicon.wikipedia.Person, + lexicon.wikipedia.Event, + lexicon.wikipedia.Location, + lexicon.wikipedia.Organization, + lexicon.wikipedia.ManMadeThing, + lexicon.wikipedia.Event) + val PersonLexicon = new UnionLexicon("NounPhraseEntityTypePerson", PersonPronoun, PosessiveDeterminer) def isWordNetPerson(token:Token): Boolean = wordnet.WordNet.isHypernymOf("person", wordnet.WordNet.lemma(token.string, "NN")) def entityTypeIndex(mention:Phrase): Int = { @@ -114,9 +81,9 @@ class OntonotesPhraseEntityTypeLabeler extends DocumentAnnotator { } def filterTrainingNounPhrases(phrases:Seq[Phrase]): Iterable[Phrase] = - // TODO This used to filter out phrases corresponding to entities with only one mention, but now we need the Mention to do this. - // How important is this filter? -akm - // mentions.groupBy(m => m.entity).filter(x => x._2.length > 1).map(x => x._2).flatten.filter(mention => !PersonLexicon.contains(mention)) + // TODO This used to filter out phrases corresponding to entities with only one mention, but now we need the Mention to do this. + // How important is this filter? 
-akm + // mentions.groupBy(m => m.entity).filter(x => x._2.length > 1).map(x => x._2).flatten.filter(mention => !PersonLexicon.contains(mention)) phrases.filter(phrase => !PersonLexicon.contains(phrase)) def train(trainDocs:Iterable[Document], testDocs:Iterable[Document]): Unit = { @@ -165,27 +132,3 @@ class OntonotesPhraseEntityTypeLabeler extends DocumentAnnotator { } } - -object NounPhraseEntityTypeLabeler extends OntonotesPhraseEntityTypeLabeler(ClasspathURL[OntonotesPhraseEntityTypeLabeler](".factorie").openConnection().getInputStream) - -/* -object NounPhraseEntityTypeLabelerTrainer { - def main(args:Array[String]): Unit = { - if (args.length == 0) println("usage: trainfile [modelfile]") - var trainDocs = LoadConll2011.loadWithParse(args(0), loadSingletons=false, callDisperseEntityTypes=true) - val testDocs = trainDocs.takeRight(20) - trainDocs = trainDocs.dropRight(20) - val labeler = new OntonotesPhraseEntityTypeLabeler - for (phrase <- labeler.filterTrainingNounPhrases(testDocs.flatMap(_.getTargetCoref.mentions).map(_.phrase))) - println("%20s %s".format(phrase.attr[OntonotesPhraseEntityType].target.categoryValue, phrase)) - - labeler.train(trainDocs, testDocs) - (trainDocs ++ testDocs).foreach(doc => doc.targetCoref.mentions.map(_.phrase).foreach(labeler.process)) - for (phrase <- labeler.filterTrainingNounPhrases(testDocs.flatMap(_.getTargetCoref.mentions).map(_.phrase))) - println("%20s %-20s %-20s %s".format(phrase.attr[OntonotesPhraseEntityType].target.categoryValue, phrase.attr[OntonotesPhraseEntityType].categoryValue, labeler.isWordNetPerson(phrase.headToken).toString, phrase)) - - if (args.length > 1) labeler.serialize(args(1)) - } -} -*/ - diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseFinder.scala b/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseFinder.scala new file mode 100644 index 0000000..3e68758 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/OntonotesPhraseFinder.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.app.nlp.coref.NerPhraseFinder +import cc.factorie.app.nlp.ner.OntonotesNerSpan + +object OntonotesPhraseFinder extends NerPhraseFinder[OntonotesNerSpan] diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/ParseAndNerBasedPhraseFinder.scala b/src/main/scala/cc/factorie/app/nlp/phrase/ParseAndNerBasedPhraseFinder.scala new file mode 100644 index 0000000..8b1c733 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/ParseAndNerBasedPhraseFinder.scala @@ -0,0 +1,3 @@ +package cc.factorie.app.nlp.phrase + +object ParseAndNerBasedPhraseFinder extends ParseBasedPhraseFinder(true) diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/ParseBasedMentionList.scala b/src/main/scala/cc/factorie/app/nlp/phrase/ParseBasedMentionList.scala new file mode 100644 index 0000000..59df6a7 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/ParseBasedMentionList.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.app.nlp.coref.{Mention, MentionList} + +class ParseBasedMentionList(spans:Iterable[Mention]) extends MentionList(spans) diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/ParseBasedMention.scala b/src/main/scala/cc/factorie/app/nlp/phrase/ParseBasedPhraseFinder.scala similarity index 84% rename from src/main/scala/cc/factorie/app/nlp/phrase/ParseBasedMention.scala rename to src/main/scala/cc/factorie/app/nlp/phrase/ParseBasedPhraseFinder.scala index b6ad226..5160923 100644 --- 
a/src/main/scala/cc/factorie/app/nlp/phrase/ParseBasedMention.scala +++ b/src/main/scala/cc/factorie/app/nlp/phrase/ParseBasedPhraseFinder.scala @@ -1,31 +1,15 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp.phrase +import cc.factorie.app.nlp.coref.{Mention, MentionList} +import cc.factorie.app.nlp.ner.NerTag +import cc.factorie.app.nlp.parse.ParseTree import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} import scala.collection.mutable -import cc.factorie.app.nlp.parse.ParseTree -import cc.factorie.app.nlp.coref.{AnyNerPhraseFinder, MentionList, Mention} -import cc.factorie.app.nlp.ner.NerTag - - -class ParseBasedMentionList(spans:Iterable[Mention]) extends MentionList(spans) object ParseBasedPhraseFinder extends ParseBasedPhraseFinder(false) -object ParseAndNerBasedPhraseFinder extends ParseBasedPhraseFinder(true) + class ParseBasedPhraseFinder(val useNER: Boolean) extends DocumentAnnotator { def prereqAttrs: Iterable[Class[_]] = if (!useNER) List(classOf[ParseTree]) else List(classOf[ParseTree], classOf[NerTag]) @@ -87,7 +71,7 @@ class ParseBasedPhraseFinder(val useNER: Boolean) extends DocumentAnnotator { private def personalPronounSpans(doc: Document): Seq[Phrase] = for (section <- doc.sections; s <- section.sentences; (t,i) <- s.tokens.zipWithIndex if isPersonalPronoun(t)) yield - new Phrase(section, s.start + i, 1,0) + new Phrase(section, s.start + i, 1,0) //this expects as input indices in the **document section** not the sentence //note that this never returns the root as the head, it always returns a pointer to an actual token in the sentence @@ -153,7 +137,6 @@ class ParseBasedPhraseFinder(val useNER: Boolean) extends DocumentAnnotator { .sortBy(phrase => (phrase.tokens.head.stringStart, phrase.length)) } - override def tokenAnnotationString(token:Token): String = token.document.attr[MentionList].filter(mention => mention.phrase.contains(token)) match { case ms:Seq[Mention] if ms.nonEmpty => ms.map(m => m.phrase.attr[NounPhraseType].categoryValue+":"+m.phrase.indexOf(token)).mkString(","); case _ => "_" } + override def tokenAnnotationString(token:Token): String = token.document.attr[MentionList].filter(mention => mention.phrase.contains(token)) match { case ms:Seq[Mention] if ms.nonEmpty => ms.map(m => m.phrase.attr[NounPhraseType].categoryValue+":"+m.phrase.indexOf(token)).mkString(","); case _ => "_" } } - diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/Phrase.scala b/src/main/scala/cc/factorie/app/nlp/phrase/Phrase.scala index 7e23273..f01a406 100644 --- a/src/main/scala/cc/factorie/app/nlp/phrase/Phrase.scala +++ b/src/main/scala/cc/factorie/app/nlp/phrase/Phrase.scala @@ -1,90 +1,29 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. 
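// --- Illustrative sketch (not part of the patch) ---
// OntonotesPhraseFinder above is simply NerPhraseFinder specialized to OntonotesNerSpan, so
// (assuming NerPhraseFinder follows the MentionPhraseFinder contract of `apply(doc): Seq[Phrase]`,
// which this patch does not show) it can be swapped in wherever NnpPosNounPhraseFinder is used,
// provided the document carries Ontonotes NER spans rather than just POS tags. Names below are
// hypothetical.
import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.phrase.{OntonotesPhraseFinder, Phrase}

object NerPhraseFinderSketch {
  def nerPhrases(nerTaggedDoc: Document): Seq[Phrase] = OntonotesPhraseFinder(nerTaggedDoc)
}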
- This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp.phrase -import cc.factorie.app.nlp._ import cc.factorie.app.nlp.parse.ParseTreeLabelDomain -import cc.factorie.app.nlp.pos.{PennPosDomain, PennPosTag} +import cc.factorie.app.nlp.pos.PennPosDomain +import cc.factorie.app.nlp.{Section, Token, TokenSpan} import cc.factorie.util.Attr + /** A Phrase is a TokenSpan that has a head token. - If offsetToHeadToken is unspecified, then it will be set automatically using HeadTokenOffset.apply. */ + * If offsetToHeadToken is unspecified, then it will be set automatically using HeadTokenOffset.apply. */ class Phrase(section:Section, start:Int, length:Int, offsetToHeadToken: Int) extends TokenSpan(section, start, length) with Attr { def this(span:TokenSpan, headTokenIndex:Int = -1) = this(span.section, span.start, span.length, headTokenIndex) - + assert(offsetToHeadToken == -1 || offsetToHeadToken >= 0 && offsetToHeadToken < length, "Offset from beginning of span, headTokenOffset="+offsetToHeadToken+", but span only has length "+length) lazy val headTokenOffset = if (offsetToHeadToken == -1) HeadTokenOffset(this) else offsetToHeadToken - + def headToken: Token = this.apply(headTokenOffset) - + def isPronoun = { val i = headToken.posTag.intValue; i == PennPosDomain.prpIndex || i == PennPosDomain.prpdIndex || i == PennPosDomain.wpIndex || i == PennPosDomain.wpdIndex } def isProperNoun = { val i = headToken.posTag.intValue; i == PennPosDomain.nnpIndex || i == PennPosDomain.nnpsIndex } def isNoun = headToken.posTag.categoryValue(0) == 'N' - def isPossessive = headToken.posTag.intValue == PennPosDomain.posIndex + def isPossessive = headToken.posTag.intValue == PennPosDomain.posIndex def isAppositionOf(other:Phrase) : Boolean = (headToken.parseLabel.intValue == ParseTreeLabelDomain.appos) && (headToken.parseParent == other.headToken) def gender = this.attr[Gender] def number = this.attr[Number] def nounPhraseType = this.attr[NounPhraseType] - -} - -/** A collection of Phrases. Typically used as an attribute of a Section or a Document. */ -class PhraseList(spans:Iterable[Phrase]) extends TokenSpanList[Phrase](spans) - -/** A collection of Phrases that are noun phrases. Typically used as an attribute of a Section or a Document. */ -class NounPhraseList(phrases:Iterable[Phrase]) extends PhraseList(phrases) - -/** A collection of VerbPhrases. Typically used as an attribute of a Section or a Document. */ -class VerbPhraseList(phrases:Iterable[Phrase]) extends PhraseList(phrases) - - -/** A heuristic for selecting the head of a phrase. - If a parse is available, use it to find the head; if a preposition is found, select the word before it; otherwise simply select the last token. 
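// --- Illustrative sketch (not part of the patch) ---
// Constructing a Phrase with offsetToHeadToken = -1 defers head selection to the HeadTokenOffset
// heuristic described above: walk up the parse tree when a parse is available, otherwise fall back
// to the token before the first preposition, then the last noun, then the last token. This assumes
// the enclosing sentence carries PennPosTags (and a parse if one is to be used); names below are
// hypothetical.
import cc.factorie.app.nlp.Section
import cc.factorie.app.nlp.phrase.Phrase

object HeadTokenSketch {
  def headWord(section: Section, startInSection: Int, length: Int): String = {
    val phrase = new Phrase(section, startInSection, length, offsetToHeadToken = -1)
    phrase.headToken.string  // headTokenOffset is computed lazily via HeadTokenOffset(phrase)
  }
}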
*/ -object HeadTokenOffset { - def apply(phrase:Phrase): Int = { - if (phrase.length == 1) return 0 - val span = phrase.value - val sentence = phrase.sentence - // If there is a parse, then traverse up the tree until just before we exit the Span - val parse = sentence.parse - if (parse ne null) { - var headSentenceIndex = math.min(span.end, sentence.end)-1 - sentence.start - var parentSentenceIndex = parse.parentIndex(headSentenceIndex) - while (span.contains(parentSentenceIndex + sentence.start)) { - headSentenceIndex = parentSentenceIndex - parentSentenceIndex = parse.parentIndex(parentSentenceIndex) - } - //Sometimes phrases are broken, consisting of more than one subgraph in the parse tree; check if parent of exit is not again part of mention - if(parentSentenceIndex >= 0) { - parentSentenceIndex = parse.parentIndex(parentSentenceIndex) - while (span.contains(parentSentenceIndex + sentence.start)) { - headSentenceIndex = parentSentenceIndex - parentSentenceIndex = parse.parentIndex(parentSentenceIndex) - } - } - return headSentenceIndex + sentence.start - span.start - } else { - // If there is a preposition, select the word just before the first preposition - val prepositionIndex = span.indexWhere(cc.factorie.app.nlp.lexicon.Preposition.contains(_)) - if (prepositionIndex >= 1) return prepositionIndex - 1 - // If there is noun, return the last noun - val lastNounIndex = span.lastIndexWhere(_.attr[PennPosTag].isNoun) - if (lastNounIndex > 0) return lastNounIndex - // Otherwise simply select the last word of the span - else return span.length-1 - } - } } diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/PhraseGender.scala b/src/main/scala/cc/factorie/app/nlp/phrase/PhraseGender.scala new file mode 100644 index 0000000..a6dcff2 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/PhraseGender.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.phrase + +class PhraseGender(val phrase:Phrase, categoryIndex:Int) extends Gender(categoryIndex) { + def this(phrase:Phrase, category:String) = this(phrase, GenderDomain.index(category)) +} diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/PhraseGenderLabeler.scala b/src/main/scala/cc/factorie/app/nlp/phrase/PhraseGenderLabeler.scala new file mode 100644 index 0000000..06f513f --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/PhraseGenderLabeler.scala @@ -0,0 +1,157 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.app.nlp.coref.PronounSets +import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} + +import scala.reflect.ClassTag + +/** Cheap gender predictor based on rules and lexicon. 
*/ +class PhraseGenderLabeler[A<:AnyRef](documentAttrToPhrases:(A)=>Iterable[Phrase])(implicit docAttrClass:ClassTag[A]) extends DocumentAnnotator { + + // todo fix this + @deprecated("This exists to preserve prior behavior, it should be a constructor argument", "10/5/15") + val lexicon = new StaticLexicons()(LexiconsProvider.classpath()) + + def process(document:Document): Document = { + for (phrase <- documentAttrToPhrases(document.attr[A])) process(phrase) + document + } + def process(phrase:Phrase): Unit = { + import GenderDomain._ + val gender = new PhraseGender(phrase, UNKNOWN) + phrase.attr += gender + if (phrase.length > 0) { + val genderFromLexicon = lexiconGender(phrase) + if (genderFromLexicon.isDefined) gender := genderFromLexicon.get + else { + val firstWord = phrase(0).string.toLowerCase + val lastWord = phrase.last.string.toLowerCase + var firstName = firstWord + if (lexicon.iesl.PersonHonorific.containsWord(firstWord)) { + gender := PERSON + if (maleHonors.contains(firstWord)) gender := MALE + else if (femaleHonors.contains(firstWord)) gender := FEMALE + if (phrase.length >= 3) firstName = phrase(1).string.toLowerCase + } + if (gender.intValue != MALE && gender.intValue != FEMALE) { + if (lexicon.iesl.Month.containsWord(firstWord)) gender := NEUTER + else if (lexicon.uscensus.PersonFirstMale.containsWord(firstName)) gender := MALE + else if (lexicon.uscensus.PersonFirstFemale.containsWord(firstName) && firstName != "an") gender := FEMALE + else if (gender.intValue == GenderDomain.UNKNOWN && lexicon.iesl.PersonLast.containsWord(lastWord)) gender := PERSON + if (lexicon.iesl.City.contains(phrase) || lexicon.iesl.Country.contains(phrase) || lexicon.iesl.OrgSuffix.containsWord(lastWord)) + if (gender.intValue == UNKNOWN) gender := NEUTER else gender := UNKNOWN // Could be either person or other; mark it unknown + } + } + } + } + + /** Test various words in the phrase to see if they indicate gender. Return an index into the NounPhraseGenderDomain. 
*/ + def lexiconGender(phrase: Phrase): Option[Int] = { + if (phrase.length == 1) lexiconGender(phrase.tokens(0).string) + else if (phrase.length == 2) lexiconGender(phrase.tokens(0).string).orElse(lexiconGender(phrase.tokens(1).string)) + else lexiconGender(phrase.headToken.string).orElse(lexiconGender(phrase.tokens(0).string).orElse(lexiconGender(phrase.tokens(1).string))) + } + def lexiconGender(word:String): Option[Int] = { + val lemma = word.toLowerCase + if (maleWords.contains(lemma)) Some(GenderDomain.MALE) + else if (femaleWords.contains(lemma)) Some(GenderDomain.FEMALE) + else if (PronounSets.neuter.contains(lemma)) Some(GenderDomain.NEUTER) + else if (PronounSets.allPersonPronouns.contains(lemma)) Some(GenderDomain.PERSON) + else None + } + + //since lemmaString is singular, we don't need to hard code in the plural form of these words + val maleHonors = Set("mr.", "mr", "mister") + val femaleHonors = Set("ms.", "ms", "mrs.", "mrs", "miss", "misses") + + val maleFemaleWords = Seq( + ("", "actress"), + ("", "adulteress"), + ("", "giantess"), + ("", "heiress"), + ("", "hostess"), + ("", "poetess"), + ("", "shepherdess"), + ("baron", "baroness"), + ("boar", "sow"), + ("boy", "girl"), + ("boy-friend", "girl-friend"), + ("boyfriend", "girlfriend"), + ("bridegroom", "bride"), + ("bro", "sis"), + ("brother", "sister"), + ("brother-in-law", "sister-in-law"), + ("buck", "roe"), + ("bull", "cow"), + ("chap", ""), + ("cock", "hen"), + ("codger", ""), + ("count", "countess"), + ("dad", "mom"), + ("dad", "mum"), + ("daddy", "mommy"), + ("deacon", "deaconess"), + ("dude", "dame"), + ("duke", "duchess"), + ("emperor", "empress"), + ("father", "mother"), + ("father-in-law", "mother-in-law"), + ("fiance", "fiancee"), + ("fianc\u00E9", "fianc\u00E9e"), + ("gigolo", "prostitute"), + ("godfather", "godmother"), + ("godson", "goddaughter"), + ("grandfather", "grandmother"), + ("grandpa", "grandma"), + ("grandson", "granddaughter"), + ("guy", "gal"), + ("he", "she"), + ("hero", "heroine"), + ("him", "her"), + ("his", "hers"), + ("husband", "wife"), + ("king", "queen"), + ("lad", "lass"), + ("landlord", "landlady"), + ("lion", "lioness"), + ("lord", "lady"), + ("male", "female"), + ("man", "woman"), + ("manservant", "maidservant"), + ("master", "mistress"), + ("men", "women"), + ("monk", "nun"), + ("nephew", "niece"), + ("pa", "ma"), + ("papa", "mama"), + ("papa", "mamma"), + ("papa", "momma"), + ("peacock", "peahen"), + ("pop", "mom"), + ("pope", ""), + ("priest", "priestess"), + ("prince", "princess"), + ("ram", "ewe"), + ("sir", "madam"), + ("sir", "ma'am"), + ("son-in-law", "daughter-in-law"), + ("stallion", "mare"), + ("step-father", "step-mother"), + ("step-son", "step-daughter"), + ("steward", "stewardess"), + ("tiger", "tigress"), + ("tom", "tib"), // cat or elephant + ("uncle", "aunt"), + ("waiter", "waitress"), + ("widower", "widow") + ) + val maleWords = maleFemaleWords.map(_._1).filter(_.length > 0).toSet + val femaleWords = maleFemaleWords.map(_._2).filter(_.length > 0).toSet + + override def tokenAnnotationString(token:Token): String = { val phrases = documentAttrToPhrases(token.document.attr[A]).filter(_.contains(token)); phrases.map(_.attr[Gender].categoryValue).mkString(",") } + override def phraseAnnotationString(phrase:Phrase): String = { val t = phrase.attr[Gender]; if (t ne null) t.categoryValue else "_" } + def prereqAttrs: Iterable[Class[_]] = List(docAttrClass.runtimeClass)//Require some TokenSpanList containing subclass of Phrase elements + // Note that this postAttr doesn't 
indicate if all Phrases or just some Mention Phrases were actually labeled. + def postAttrs: Iterable[Class[_]] = List(classOf[PhraseGender]) +} diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/PhraseList.scala b/src/main/scala/cc/factorie/app/nlp/phrase/PhraseList.scala new file mode 100644 index 0000000..8b50858 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/PhraseList.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.phrase + +import cc.factorie.app.nlp.TokenSpanList + +/** A collection of Phrases. Typically used as an attribute of a Section or a Document. */ +class PhraseList(spans:Iterable[Phrase]) extends TokenSpanList[Phrase](spans) diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/PhraseNumber.scala b/src/main/scala/cc/factorie/app/nlp/phrase/PhraseNumber.scala new file mode 100644 index 0000000..9e57687 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/PhraseNumber.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.phrase + +class PhraseNumber(val phrase:Phrase, value:Int) extends Number(value) { + def this(phrase:Phrase, value:String) = this(phrase, NumberDomain.index(value)) +} + diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/PosBasedNounPhraseFinder.scala b/src/main/scala/cc/factorie/app/nlp/phrase/PosBasedNounPhraseFinder.scala deleted file mode 100644 index b209e62..0000000 --- a/src/main/scala/cc/factorie/app/nlp/phrase/PosBasedNounPhraseFinder.scala +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.phrase -import cc.factorie.app.nlp._ - -import scala.collection.mutable.ListBuffer - -/** Find noun phrases merely by contiguous nouns (possibly prefixed by adjectives) and pronouns. - This is simple but much less accurate than ChainChunker. 
- @author Andrew McCallum */ -object PosBasedNounPhraseFinder extends DocumentAnnotator { - def process(document:Document): Document = { - document.attr += new NounPhraseList(phrases(document)) - document - } - def phrases(document:Document): Seq[Phrase] = { - val phrases = new ListBuffer[Phrase]() - var tempSpan: Phrase = null - for (section <- document.sections; token <- section.tokens) { - // Put a span around contiguous sequences of NN or PR part-of-speech prefixes - val posPrefix = token.attr[pos.PennPosTag].categoryValue.take(2) - if (posPrefix == "NN" || posPrefix == "PR" || (posPrefix == "JJ" && token.hasNext && token.next.attr[pos.PennPosTag].categoryValue.take(2) == "NN")) { - if (tempSpan eq null) tempSpan = new Phrase(section, token.position, 1,offsetToHeadToken = -1) - else tempSpan.append(1)(null) - } else if (tempSpan ne null) { - if (token.string == "-" && token.hasNext && token.next.attr[pos.PennPosTag].categoryValue.take(2) == "NN") tempSpan.append(1)(null) // Handle dashed nouns - else { phrases += tempSpan; tempSpan = null} - } - } - phrases - } - override def tokenAnnotationString(token:Token): String = { - val phrases = token.document.attr[NounPhraseList].spansContaining(token) - if (phrases.isEmpty) return null - phrases.map(c => if (c.head == token) "B-NP" else "I-NP").mkString(",") - } - def prereqAttrs: Iterable[Class[_]] = List(classOf[pos.PennPosTag]) - def postAttrs: Iterable[Class[_]] = List(classOf[NounPhraseList]) -} diff --git a/src/main/scala/cc/factorie/app/nlp/phrase/VerbPhraseList.scala b/src/main/scala/cc/factorie/app/nlp/phrase/VerbPhraseList.scala new file mode 100644 index 0000000..1860c16 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/phrase/VerbPhraseList.scala @@ -0,0 +1,4 @@ +package cc.factorie.app.nlp.phrase + +/** A collection of VerbPhrases. Typically used as an attribute of a Section or a Document. */ +class VerbPhraseList(phrases:Iterable[Phrase]) extends PhraseList(phrases) diff --git a/src/main/scala/cc/factorie/app/nlp/pos/ChainPosTagger.scala b/src/main/scala/cc/factorie/app/nlp/pos/ChainPosTagger.scala deleted file mode 100644 index 7e60177..0000000 --- a/src/main/scala/cc/factorie/app/nlp/pos/ChainPosTagger.scala +++ /dev/null @@ -1,319 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -/* -package cc.factorie.app.nlp.pos - -import java.io._ - -import cc.factorie.app.chain.ChainModel -import cc.factorie.app.chain.Observations._ -import cc.factorie.app.nlp._ -import cc.factorie.optimize.Trainer -import cc.factorie.util.{BinarySerializer, ClasspathURL, HyperparameterMain} -import cc.factorie.variable.{BinaryFeatureVectorVariable, CategoricalVectorDomain, HammingObjective, LabeledMutableDiscreteVar, LabeledVar} - -import scala.reflect.ClassTag -*/ -/** A linear-chain CRF part-of-speech tagger, doing inference by Viterbi. 
- @author Alexandre Passos, Andrew McCallum - */ -/* -abstract class ChainPosTagger[A<:PosTag](val tagConstructor:(Token)=>A)(implicit ct:ClassTag[A]) extends DocumentAnnotator { - def this(tagConstructor:(Token)=>A, url:java.net.URL)(implicit ct:ClassTag[A]) = { this(tagConstructor); deserialize(url.openConnection().getInputStream) } - def process(document: Document) = { - document.sentences.foreach(s => { - if (s.nonEmpty) { - s.tokens.foreach(t => if (!t.attr.contains[A]) t.attr += tagConstructor(t)) - initPOSFeatures(s) - model.maximize(s.tokens.map(_.attr[A]))(null) - } - }) - document - } - - def prereqAttrs = Seq(classOf[Token], classOf[Sentence]) - def postAttrs = Seq(ct.runtimeClass) - def tokenAnnotationString(token: Token) = { val label = token.attr[A with LabeledVar]; if (label ne null) label.categoryValue else "(null)" } - - def serialize(stream: OutputStream) { - import cc.factorie.util.CubbieConversions._ - val dstream = new DataOutputStream(new BufferedOutputStream(stream)) - BinarySerializer.serialize(PosFeaturesDomain.dimensionDomain, dstream) - BinarySerializer.serialize(model, dstream) - dstream.close() - } - def deserialize(stream: InputStream) { - import cc.factorie.util.CubbieConversions._ - val dstream = new DataInputStream(new BufferedInputStream(stream)) - BinarySerializer.deserialize(PosFeaturesDomain.dimensionDomain, dstream) - BinarySerializer.deserialize(model, dstream) - dstream.close() - } - - def train(trainSentences:Seq[Sentence], - testSentences:Seq[Sentence], - lrate:Double = 0.1, - decay:Double = 0.01, - cutoff:Int = 2, - doBootstrap:Boolean = true, - useHingeLoss:Boolean = false, - numIterations: Int = 5, - l1Factor:Double = 0.000001, - l2Factor:Double = 0.000001)(implicit random: scala.util.Random) { - // TODO Accomplish this TokenNormalization instead by calling POS3.preProcess - println("Initializing POS features for training sentences") - trainSentences.foreach(initPOSFeatures) - println("Finished initializing POS features for training sentences") - PosFeaturesDomain.freeze() - println("Initializing POS features for testing sentences") - testSentences.foreach(initPOSFeatures) - println("Finished initializing POS features for testing sentences") - def evaluate() { - println("Evaluating") - (trainSentences ++ testSentences).foreach(s => model.maximize(s.tokens.map(_.attr[A with LabeledVar]))(null) ) - println("Train accuracy: "+ HammingObjective.accuracy(trainSentences.flatMap(s => s.tokens.map(_.attr[A with LabeledVar])))) - println("Test accuracy: "+ HammingObjective.accuracy(testSentences.flatMap(s => s.tokens.map(_.attr[A with LabeledVar])))) - } - val examples = - if(useHingeLoss) - trainSentences.map(sentence => new model.ChainStructuredSVMExample(sentence.tokens.map(_.attr[A with LabeledMutableDiscreteVar]))).toSeq - else - trainSentences.map(sentence => new model.ChainLikelihoodExample(sentence.tokens.map(_.attr[A with LabeledMutableDiscreteVar]))) - //val optimizer = new cc.factorie.optimize.AdaGrad(rate=lrate) - val optimizer = new cc.factorie.optimize.AdaGradRDA(rate=lrate, l1=l1Factor/examples.length, l2=l2Factor/examples.length) - println("Running Parameter Optimization") - Trainer.onlineTrain(model.parameters, examples, maxIterations=numIterations, optimizer=optimizer, evaluate=evaluate, useParallelTrainer = false) - } - - - object PosFeaturesDomain extends CategoricalVectorDomain[String] - class PosFeatures(val token:Token) extends BinaryFeatureVectorVariable[String] { def domain = PosFeaturesDomain; override def skipNonCategories = 
true } - - val posDomain = tagConstructor(null).domain - val model = new ChainModel[A, PosFeatures, Token](posDomain, - PosFeaturesDomain, - l => l.token.attr[PosFeatures], - l => l.token, - t => t.attr[A]){ - useObsMarkov = false - } - - def initPOSFeatures(sentence: Sentence): Unit -} -*/ -/* -class WSJChainPosTagger extends ChainPosTagger((t:Token) => new PennPosTag(t, 0)) with Serializable { - def this(url: java.net.URL) = { - this() - deserialize(url.openConnection().getInputStream) - } - - def initPOSFeatures(sentence: Sentence): Unit = { - import cc.factorie.app.strings.simplifyDigits - for (token <- sentence.tokens) { - if(token.attr[PosFeatures] ne null) - token.attr.remove[PosFeatures] - - val features = token.attr += new PosFeatures(token) - val rawWord = token.string - val word = simplifyDigits(rawWord).toLowerCase - features += "W="+word - features += "STEM=" + cc.factorie.app.strings.porterStem(word) - features += "SHAPE2=" + cc.factorie.app.strings.stringShape(rawWord, 2) - features += "SHAPE3=" + cc.factorie.app.strings.stringShape(rawWord, 3) - // pre/suf of length 1..9 - //for (i <- 1 to 9) { - val i = 3 - features += "SUFFIX" + i + "=" + word.takeRight(i) - features += "PREFIX" + i + "=" + word.take(i) - //} - if (token.isCapitalized) features += "CAPITALIZED" - if (token.string.matches("[A-Z]")) features += "CONTAINS_CAPITAL" - if (token.string.matches("-")) features += "CONTAINS_DASH" - if (token.containsDigit) features += "NUMERIC" - if (token.isPunctuation) features += "PUNCTUATION" - } - addNeighboringFeatureConjunctions(sentence.tokens, (t: Token) => t.attr[PosFeatures], "W=[^@]*$", List(-2), List(-1), List(1), List(-2,-1), List(-1,0)) - } -} -object WSJChainPosTagger extends WSJChainPosTagger(ClasspathURL[WSJChainPosTagger](".factorie")) -*/ -/* -class OntonotesChainPosTagger extends ChainPosTagger((t:Token) => new PennPosTag(t, 0)) with Serializable { - def this(url: java.net.URL) = { - this() - deserialize(url.openConnection().getInputStream) - } - - def initPOSFeatures(sentence: Sentence): Unit = { - import cc.factorie.app.strings.simplifyDigits - for (token <- sentence.tokens) { - if(token.attr[PosFeatures] ne null) - token.attr.remove[PosFeatures] - - val features = token.attr += new PosFeatures(token) - val rawWord = token.string - val word = simplifyDigits(rawWord).toLowerCase - features += "W="+word - features += "STEM=" + cc.factorie.app.strings.porterStem(word) - features += "SHAPE2=" + cc.factorie.app.strings.stringShape(rawWord, 2) - features += "SHAPE3=" + cc.factorie.app.strings.stringShape(rawWord, 3) - // pre/suf of length 1..9 - //for (i <- 1 to 9) { - val i = 3 - features += "SUFFIX" + i + "=" + word.takeRight(i) - features += "PREFIX" + i + "=" + word.take(i) - //} - if (token.isCapitalized) features += "CAPITALIZED" - if (token.string.matches("[A-Z]")) features += "CONTAINS_CAPITAL" - if (token.string.matches("-")) features += "CONTAINS_DASH" - if (token.containsDigit) features += "NUMERIC" - if (token.isPunctuation) features += "PUNCTUATION" - } - addNeighboringFeatureConjunctions(sentence.tokens, (t: Token) => t.attr[PosFeatures], "W=[^@]*$", List(-2), List(-1), List(1), List(-2,-1), List(-1,0)) - } -} - -object OntonotesChainPosTagger extends OntonotesChainPosTagger(ClasspathURL[OntonotesChainPosTagger](".factorie")) with Serializable -*/ - -/* -class ChainPosTrainer[A<:PosTag, B<:ChainPosTagger[A]](taggerConstructor: () => B, loadingMethod:(String) => Seq[Document])(implicit ct:ClassTag[A]) extends HyperparameterMain { - def 
evaluateParameters(args: Array[String]): Double = { - implicit val random = new scala.util.Random(0) - val opts = new ForwardPosOptions - opts.parse(args) - assert(opts.trainDir.wasInvoked) - // Expects three command-line arguments: a train file, a test file, and a place to save the model in - // the train and test files are supposed to be in OWPL format - val pos = taggerConstructor() - - val trainDocs = loadingMethod(opts.trainDir.value) - println("NUM TRAIN DOCS:" + trainDocs.size) - val testDocs = loadingMethod(opts.testDir.value) - println("NUM TEST DOCS:" + testDocs.size) - - //for (d <- trainDocs) println("POS3.train 1 trainDoc.length="+d.length) - println("Read %d training tokens.".format(trainDocs.map(_.tokenCount).sum)) - println("Read %d testing tokens.".format(testDocs.map(_.tokenCount).sum)) - - val trainPortionToTake = if(opts.trainPortion.wasInvoked) opts.trainPortion.value.toDouble else 1.0 - val testPortionToTake = if(opts.testPortion.wasInvoked) opts.testPortion.value.toDouble else 1.0 - println("Flatmapping Training Sentences") - val trainSentencesFull = trainDocs.flatMap(_.sentences) - val trainSentences = trainSentencesFull.take((trainPortionToTake*trainSentencesFull.length).floor.toInt) - println("Finished Flatmapping Training Sentences") - println("Flatmapping Testing Sentences") - val testSentencesFull = testDocs.flatMap(_.sentences) - val testSentences = testSentencesFull.take((testPortionToTake*testSentencesFull.length).floor.toInt) - println("Finished Flatmapping Testing Sentences") - - println("Training") - pos.train(trainSentences, - testSentences, - opts.rate.value, - opts.delta.value, - opts.cutoff.value, - opts.updateExamples.value, - opts.useHingeLoss.value, - l1Factor=opts.l1.value, - l2Factor=opts.l2.value) - println("Finished Training") - if (opts.saveModel.value) { - println("Serializing Model") - pos.serialize(new FileOutputStream(new File(opts.modelFile.value))) - println("Finished Serializing Model") - val pos2 = taggerConstructor() - println("Deserializing Model") - pos2.deserialize(new FileInputStream(new java.io.File(opts.modelFile.value))) - println("Finished Deserializing Model") - } - val acc = HammingObjective.accuracy(testDocs.flatMap(d => d.sentences.flatMap(s => s.tokens.map(_.attr[A with LabeledVar])))) - if(opts.targetAccuracy.wasInvoked) cc.factorie.assertMinimalAccuracy(acc,opts.targetAccuracy.value.toDouble) - - acc - } -} -object OntonotesChainPosTrainer extends ChainPosTrainer[PennPosTag, OntonotesChainPosTagger]( - () => new OntonotesChainPosTagger(), - (dirName: String) => load.LoadOntonotes5.fromFilename(dirName) -) -*/ -/* -object ChainPosOptimizer { - def main(args: Array[String]) { - val opts = new ForwardPosOptions - val trainerName = args(0) - opts.parse(args.slice(1, opts.size)) - opts.saveModel.setValue(false) - val l1 = cc.factorie.util.HyperParameter(opts.l1, new cc.factorie.util.LogUniformDoubleSampler(1e-10, 1e2)) - val l2 = cc.factorie.util.HyperParameter(opts.l2, new cc.factorie.util.LogUniformDoubleSampler(1e-10, 1e2)) - val rate = cc.factorie.util.HyperParameter(opts.rate, new cc.factorie.util.LogUniformDoubleSampler(1e-4, 1e4)) - val delta = cc.factorie.util.HyperParameter(opts.delta, new cc.factorie.util.LogUniformDoubleSampler(1e-4, 1e4)) - val cutoff = cc.factorie.util.HyperParameter(opts.cutoff, new cc.factorie.util.SampleFromSeq(List(0,1,2,3))) - val qs = new cc.factorie.util.QSubExecutor(60, trainerName) - val optimizer = new cc.factorie.util.HyperParameterSearcher(opts, Seq(l1, l2, rate, delta, cutoff), 
qs.execute, 200, 180, 60) - val result = optimizer.optimize() - println("Got results: " + result.mkString(" ")) - println("Best l1: " + opts.l1.value + " best l2: " + opts.l2.value) - opts.saveModel.setValue(true) - println("Running best configuration...") - import scala.concurrent.Await - import scala.concurrent.duration._ - Await.result(qs.execute(opts.values.flatMap(_.unParse).toArray), 5.hours) - println("Done") - } -} - - -class SpanishChainPosTagger extends ChainPosTagger((t:Token) => new SpanishPosTag(t, 0)) { - def this(url: java.net.URL) = { - this() - deserialize(url.openConnection().getInputStream) - } - - def initPOSFeatures(sentence: Sentence): Unit = { - import cc.factorie.app.strings.simplifyDigits - for (token <- sentence.tokens) { - if(token.attr[PosFeatures] ne null) - token.attr.remove[PosFeatures] - - val features = token.attr += new PosFeatures(token) - val rawWord = token.string - val word = simplifyDigits(rawWord).toLowerCase - features += "W="+word - features += "STEM=" + cc.factorie.app.strings.porterStem(word) - features += "SHAPE2=" + cc.factorie.app.strings.stringShape(rawWord, 2) - features += "SHAPE3=" + cc.factorie.app.strings.stringShape(rawWord, 3) - // pre/suf of length 1..9 - //for (i <- 1 to 9) { - val i = 3 - features += "SUFFIX" + i + "=" + word.takeRight(i) - features += "PREFIX" + i + "=" + word.take(i) - //} - if (token.isCapitalized) features += "CAPITALIZED" - if (token.string.matches("[A-Z]")) features += "CONTAINS_CAPITAL" - if (token.string.matches("-")) features += "CONTAINS_DASH" - if (token.containsDigit) features += "NUMERIC" - if (token.isPunctuation) features += "PUNCTUATION" - } - addNeighboringFeatureConjunctions(sentence.tokens, (t: Token) => t.attr[PosFeatures], "W=[^@]*$", List(-2), List(-1), List(1), List(-2,-1), List(-1,0)) - } -} -object SpanishChainPosTagger extends SpanishChainPosTagger(ClasspathURL[SpanishChainPosTagger](".factorie")) -object SpanishChainPosTrainer extends ChainPosTrainer[SpanishPosTag, SpanishChainPosTagger]( - () => new SpanishChainPosTagger(), - (dirName: String) => load.LoadSpanishConll2008.fromFilename(dirName) -) -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/pos/CtbChainPosTagger.scala b/src/main/scala/cc/factorie/app/nlp/pos/CtbChainPosTagger.scala deleted file mode 100644 index a3d6fef..0000000 --- a/src/main/scala/cc/factorie/app/nlp/pos/CtbChainPosTagger.scala +++ /dev/null @@ -1,215 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* -package cc.factorie.app.nlp.pos - -import java.io._ -import java.util.{HashMap, HashSet} - -import cc.factorie.app.chain.Observations._ -import cc.factorie.app.nlp.{Document, Sentence, Token} -import cc.factorie.util.{BinarySerializer, ClasspathURL} -*/ -/** - * Created by Oskar Singer on 10/6/14. 
- */ -/* -class CtbChainPosTagger extends ChainPosTagger((t:Token) => new CtbPosTag(t, 0)) { - - private var prefixMap = new HashMap[Char, HashSet[String]] - private var suffixMap = new HashMap[Char, HashSet[String]] - - def this(url: java.net.URL) = { - this() - deserialize(url.openConnection().getInputStream) - } - - override def train(trainSentences:Seq[Sentence], - testSentences:Seq[Sentence], - lrate:Double = 0.1, - decay:Double = 0.01, - cutoff:Int = 2, - doBootstrap:Boolean = true, - useHingeLoss:Boolean = false, - numIterations: Int = 5, - l1Factor:Double = 0.000001, - l2Factor:Double = 0.000001)(implicit random: scala.util.Random): Unit = { - initPrefixAndSuffixMaps(trainSentences.flatMap(_.tokens)) - super.train(trainSentences, testSentences, lrate, decay, cutoff, doBootstrap, useHingeLoss, numIterations, l1Factor, l2Factor) - } - - def initPOSFeatures(sentence: Sentence): Unit = { - import cc.factorie.app.chineseStrings._ - - for (token <- sentence.tokens) { - if(token.attr[PosFeatures] ne null) - token.attr.remove[PosFeatures] - - val features = token.attr += new PosFeatures(token) - val rawWord = token.string - val prefix = rawWord(0) - val suffix = rawWord(rawWord.size - 1) - - features += "W="+rawWord - - (0 to 4).foreach { - i => - features += "SUFFIX" + i + "=" + rawWord.takeRight(i) - features += "PREFIX" + i + "=" + rawWord.take(i) - } - - if(prefixMap.containsKey(prefix)) { - val prefixLabelSet = prefixMap.get(prefix) - val prefixCTBMorph = posDomain.categories.map{ - category => - - val hasCategory = { - if(prefixLabelSet.contains(category)) - "TRUE" - else - "FALSE" - } - - "PRE_" + category + "_" + hasCategory - } - - features ++= prefixCTBMorph - } - - if(suffixMap.containsKey(suffix)) { - val suffixLabelSet = suffixMap.get(suffix) - val suffixCTBMorph = posDomain.categories.map{ - category => - - val hasCategory = { - if(suffixLabelSet.contains(category)) - "TRUE" - else - "FALSE" - } - - "SUF_" + category + "_" + hasCategory - } - - features ++= suffixCTBMorph - } - - if (hasPunctuation(rawWord)) features += "PUNCTUATION" - /* - if (hasNumeric(rawWord)) features += "NUMERIC" - if (hasChineseNumeric(rawWord)) features += "CHINESE_NUMERIC" - if (hasAlpha(rawWord)) features += "ALPHA" - */ - } - - addNeighboringFeatureConjunctions(sentence.tokens, - (t: Token) => t.attr[PosFeatures], - "W=[^@]*$", - List(-2), - List(-1), - List(1), - List(-2,-1), - List(-1,0)) - } - - def initPrefixAndSuffixMaps(tokens: Seq[Token]): Unit = { - prefixMap.clear() - suffixMap.clear() - - tokens.map( - token => (token.string, token.attr[LabeledCtbPosTag].categoryValue) - ).foreach{ - case (word, label) => - - val prefix = word(0) - val suffix = word(word.size - 1) - - val prefixLabelSet = prefixMap.get(prefix) - - if(prefixLabelSet != null) { - if(!prefixLabelSet.contains(label)) { - prefixLabelSet.add(label) - } - } else { - val labelSet = new HashSet[String] - - labelSet.add(label) - prefixMap.put(prefix, labelSet) - } - - val suffixLabelSet = suffixMap.get(suffix) - - if(suffixLabelSet != null) { - if(!suffixLabelSet.contains(label)) { - suffixLabelSet.add(label) - } - } else { - val labelSet = new HashSet[String] - - labelSet.add(label) - suffixMap.put(suffix, labelSet) - } - } - - println("PREFIX MAP SIZE: " + prefixMap.size()) - println("SUFFIX MAP SIZE: " + suffixMap.size()) - } - - override def serialize(stream: OutputStream) { - import cc.factorie.util.CubbieConversions._ - val dstream = new DataOutputStream(new BufferedOutputStream(stream)) - val out = new 
ObjectOutputStream(dstream) - out.writeObject(prefixMap) - out.writeObject(suffixMap) - BinarySerializer.serialize(PosFeaturesDomain.dimensionDomain, dstream) - BinarySerializer.serialize(model, dstream) - dstream.close() - out.close() - } - override def deserialize(stream: InputStream) { - import cc.factorie.util.CubbieConversions._ - val dstream = new DataInputStream(new BufferedInputStream(stream)) - val in = new ObjectInputStream(dstream) - prefixMap = in.readObject().asInstanceOf[HashMap[Char, HashSet[String]]] - suffixMap = in.readObject().asInstanceOf[HashMap[Char, HashSet[String]]] - BinarySerializer.deserialize(PosFeaturesDomain.dimensionDomain, dstream) - BinarySerializer.deserialize(model, dstream) - dstream.close() - in.close() - } -} -object CtbChainPosTagger extends CtbChainPosTagger(ClasspathURL[CtbChainPosTagger](".factorie")) - -object CtbChainPosTrainer extends ChainPosTrainer[CtbPosTag, CtbChainPosTagger]( - () => new CtbChainPosTagger(), - (dirName: String) => { - val directory = new File(dirName) - - val documents = - (for{ - file <- directory.listFiles - if file.isFile - document = new Document - line <- scala.io.Source.fromFile(file, "utf-8").getLines - if line.size > 0 && line(0) != '<' - sentence = new Sentence(document) - (word, label) <- line.split(' ').map( pair => {val (word, label) = pair.splitAt(pair.lastIndexOf('_')); (word, label.slice(1,label.size))} ) - token = new Token(sentence, word) - labeledTag = token.attr += new LabeledCtbPosTag(token, label) - } yield document - ).toIndexedSeq.distinct - - documents - } -) -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/pos/CtbPosTag.scala b/src/main/scala/cc/factorie/app/nlp/pos/CtbPosTag.scala deleted file mode 100644 index 4023a58..0000000 --- a/src/main/scala/cc/factorie/app/nlp/pos/CtbPosTag.scala +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.pos - -import cc.factorie.app.nlp._ -import cc.factorie.variable._ - -/** - * Created by oskar on 9/23/14. 
- */ -object CtbPosDomain extends CategoricalDomain[String] { - this ++= Vector( - "VA", - "VC", - "VE", - "VV", - "NR", - "NT", - "NN", - "LC", - "PN", - "DT", - "CD", - "OD", - "M", - "X", - "AD", - "P", - "CC", - "CS", - "DEC", - "DEG", - "DER", - "DEV", - "SP", - "AS", - "ETC", - "SP", - "MSP", - "IJ", - "ON", - "PU", - "JJ", - "FW", - "LB", - "SB", - "BA", - "URL" - ) - freeze() - - def isNoun(pos:String): Boolean = pos(0) == 'N' - def isProperNoun(pos:String) = { pos == "NR" } - def isVerb(pos:String) = pos(0) == 'V' - def isAdjective(pos:String) = pos(0) == 'J' - def isPersonalPronoun(pos: String) = pos == "PRP" -} - -class CtbPosTag(token: Token, initialIndex: Int) extends PosTag(token, initialIndex) { - def this(token: Token, initialCategory: String) = { - this(token, CtbPosDomain.index(initialCategory.split('-')(0))) - } - final def domain = CtbPosDomain - def isNoun = domain.isNoun(categoryValue) - def isProperNoun = domain.isProperNoun(categoryValue) - def isVerb = domain.isVerb(categoryValue) - def isAdjective = domain.isAdjective(categoryValue) - def isPersonalPronoun = domain.isPersonalPronoun(categoryValue) -} - -class LabeledCtbPosTag(token: Token, targetValue: String) extends CtbPosTag(token, targetValue) with CategoricalLabeling[String] diff --git a/src/main/scala/cc/factorie/app/nlp/pos/ForwardPosTagger.scala b/src/main/scala/cc/factorie/app/nlp/pos/ForwardPosTagger.scala index a8d871c..75947e1 100644 --- a/src/main/scala/cc/factorie/app/nlp/pos/ForwardPosTagger.scala +++ b/src/main/scala/cc/factorie/app/nlp/pos/ForwardPosTagger.scala @@ -1,31 +1,21 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ package cc.factorie.app.nlp.pos + import java.io._ -import cc.factorie._ import cc.factorie.app.classify.backend.LinearMulticlassClassifier import cc.factorie.app.nlp._ -import cc.factorie.la._ +import cc.factorie.la.{SparseBinaryTensor1, WeightsMapAccumulator} import cc.factorie.optimize.Trainer import cc.factorie.util._ import cc.factorie.variable.{BinaryFeatureVectorVariable, CategoricalVectorDomain} +import cc.factorie.{Tensor1, la, optimize} /** A part-of-speech tagger that predicts by greedily labeling each word in sequence. - Although it does not use Viterbi, it is surprisingly accurate. It is also fast. - - For the Viterbi-based part-of-speech tagger, see ChainPosTagger. - @author Andrew McCallum, */ + * Although it does not use Viterbi, it is surprisingly accurate. It is also fast. + ** + *For the Viterbi-based part-of-speech tagger, see ChainPosTagger. 
+ * + *@author Andrew McCallum, */ class ForwardPosTagger extends DocumentAnnotator with Serializable { private val logger = Logger.getLogger(this.getClass.getName) @@ -42,11 +32,11 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { logger.debug("ForwardPosTagger loading from "+url) deserialize(stream) } - + object FeatureDomain extends CategoricalVectorDomain[String] class FeatureVariable(t:Tensor1) extends BinaryFeatureVectorVariable[String] { def domain = FeatureDomain; set(t)(null) } // Only used for printing diagnostics lazy val model = new LinearMulticlassClassifier(PennPosDomain.size, FeatureDomain.dimensionSize) - + /** Local lemmatizer used for POS features. */ protected def lemmatize(string:String): String = cc.factorie.app.strings.replaceDigits(string) /** A special IndexedSeq[String] that will return "null" for indices out of bounds, rather than throwing an error */ @@ -61,8 +51,8 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { } protected def lemmas(tokens:Seq[Token]) = new Lemmas(tokens) -// This should not be a singleton object, global mutable state is bad -luke -/** Infrastructure for building and remembering a list of training data words that nearly always have the same POS tag. + // This should not be a singleton object, global mutable state is bad -luke + /** Infrastructure for building and remembering a list of training data words that nearly always have the same POS tag. Used as cheap "stacked learning" features when looking-ahead to words not yet predicted by this POS tagger. The key into the ambiguityClasses is app.strings.replaceDigits().toLowerCase */ object WordData { @@ -73,34 +63,34 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { val wordInclusionThreshold = 1 val sureTokenThreshold = -1 // -1 means don't consider any tokens "sure" - def computeWordFormsByDocumentFrequency(tokens: Iterable[Token], cutoff: Integer, numToksPerDoc: Int) = { + def computeWordFormsByDocumentFrequency(tokens: Iterable[Token], cutoff: Integer, numToksPerDoc: Int) = { var begin = 0 for(i <- numToksPerDoc to tokens.size by numToksPerDoc){ val docTokens = tokens.slice(begin,i) val docUniqueLemmas = docTokens.map(x => lemmatize(x.string).toLowerCase).toSet for(lemma <- docUniqueLemmas){ if (!docWordCounts.contains(lemma)) { - docWordCounts(lemma) = 0 + docWordCounts(lemma) = 0 } docWordCounts(lemma) += 1 } begin = i } - + // deal with last chunk of sentences if(begin < tokens.size){ - val docTokens = tokens.slice(begin,tokens.size) - val docUniqueLemmas = docTokens.map(x => lemmatize(x.string).toLowerCase).toSet - for(lemma <- docUniqueLemmas){ - if (!docWordCounts.contains(lemma)) { - docWordCounts(lemma) = 0 - } - docWordCounts(lemma) += 1 - } + val docTokens = tokens.slice(begin,tokens.size) + val docUniqueLemmas = docTokens.map(x => lemmatize(x.string).toLowerCase).toSet + for(lemma <- docUniqueLemmas){ + if (!docWordCounts.contains(lemma)) { + docWordCounts(lemma) = 0 + } + docWordCounts(lemma) += 1 + } } docWordCounts = docWordCounts.filter(_._2 > cutoff) } - + def computeAmbiguityClasses(tokens: Iterable[Token]) = { val posCounts = collection.mutable.HashMap[String,Array[Int]]() val wordCounts = collection.mutable.HashMap[String,Double]() @@ -133,7 +123,7 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { }) } } - + def features(token:Token, lemmaIndex:Int, lemmas:Lemmas): SparseBinaryTensor1 = { def lemmaStringAtOffset(offset:Int): String = "L@"+offset+"="+lemmas.docFreqLc(lemmaIndex + offset) 
// this is lowercased def wordStringAtOffset(offset:Int): String = "W@"+offset+"="+lemmas.docFreq(lemmaIndex + offset) // this is not lowercased, but still has digits replaced @@ -180,16 +170,16 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { addFeature(wp2) addFeature(wp3) // The paper also includes wp3 and wm3 - + // not in ClearNLP -// addFeature(lp3) -// addFeature(lp2) -// addFeature(lp1) -// addFeature(l0) -// addFeature(lm1) -// addFeature(lm2) -// addFeature(lm3) - + // addFeature(lp3) + // addFeature(lp2) + // addFeature(lp1) + // addFeature(l0) + // addFeature(lm1) + // addFeature(lm2) + // addFeature(lm3) + addFeature(pm3) addFeature(pm2) addFeature(pm1) @@ -205,10 +195,10 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { addFeature(pm2+pm1) addFeature(ap1+ap2) addFeature(pm1+ap1) - -// addFeature(pm1+a0) // Not in http://www.aclweb.org/anthology-new/P/P12/P12-2071.pdf -// addFeature(a0+ap1) // Not in http://www.aclweb.org/anthology-new/P/P12/P12-2071.pdf - + + // addFeature(pm1+a0) // Not in http://www.aclweb.org/anthology-new/P/P12/P12-2071.pdf + // addFeature(a0+ap1) // Not in http://www.aclweb.org/anthology-new/P/P12/P12-2071.pdf + addFeature(lm2+lm1+l0) addFeature(lm1+l0+lp1) addFeature(l0+lp1+lp2) @@ -218,32 +208,32 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { addFeature(pm1+a0+ap1) addFeature(pm2+pm1+ap1) addFeature(pm1+ap1+ap2) - -// addFeature(a0+ap1+ap2) // Not in http://www.aclweb.org/anthology-new/P/P12/P12-2071.pdf - + + // addFeature(a0+ap1+ap2) // Not in http://www.aclweb.org/anthology-new/P/P12/P12-2071.pdf + addFeature(takePrefix(W, 1)) addFeature(takePrefix(W, 2)) addFeature(takePrefix(W, 3)) - + // not in ClearNLP -// addFeature("PREFIX2@1="+takePrefix(Wp1, 2)) -// addFeature("PREFIX3@1="+takePrefix(Wp1, 3)) -// addFeature("PREFIX2@2="+takePrefix(Wp2, 2)) -// addFeature("PREFIX3@2="+takePrefix(Wp2, 3)) - + // addFeature("PREFIX2@1="+takePrefix(Wp1, 2)) + // addFeature("PREFIX3@1="+takePrefix(Wp1, 3)) + // addFeature("PREFIX2@2="+takePrefix(Wp2, 2)) + // addFeature("PREFIX3@2="+takePrefix(Wp2, 3)) + addFeature(takeSuffix(W, 1)) addFeature(takeSuffix(W, 2)) addFeature(takeSuffix(W, 3)) addFeature(takeSuffix(W, 4)) - + // not in ClearNLP -// addFeature("SUFFIX1@1="+takeRight(Wp1, 1)) -// addFeature("SUFFIX2@1="+takeRight(Wp1, 2)) -// addFeature("SUFFIX3@1="+takeRight(Wp1, 3)) -// addFeature("SUFFIX4@1="+takeRight(Wp1, 4)) -// addFeature("SUFFIX2@2="+takeRight(Wp2, 2)) -// addFeature("SUFFIX3@2="+takeRight(Wp2, 3)) -// addFeature("SUFFIX4@2="+takeRight(Wp2, 4)) + // addFeature("SUFFIX1@1="+takeRight(Wp1, 1)) + // addFeature("SUFFIX2@1="+takeRight(Wp1, 2)) + // addFeature("SUFFIX3@1="+takeRight(Wp1, 3)) + // addFeature("SUFFIX4@1="+takeRight(Wp1, 4)) + // addFeature("SUFFIX2@2="+takeRight(Wp2, 2)) + // addFeature("SUFFIX3@2="+takeRight(Wp2, 3)) + // addFeature("SUFFIX4@2="+takeRight(Wp2, 4)) addFeature("SHAPE@-2="+cc.factorie.app.strings.stringShape(Wm2, 2)) addFeature("SHAPE@-1="+cc.factorie.app.strings.stringShape(Wm1, 2)) addFeature("SHAPE@0="+cc.factorie.app.strings.stringShape(W, 2)) @@ -276,7 +266,7 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { } } } - + def predict(tokens: Seq[Token]): Unit = { val lemmaStrings = lemmas(tokens) for (index <- 0 until tokens.length) { @@ -294,7 +284,7 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { def predict(span: TokenSpan): Unit = predict(span.tokens) def predict(document: Document): Unit = { for 
(section <- document.sections) - if (section.hasSentences) document.sentences.foreach(predict(_)) // we have Sentence boundaries + if (section.hasSentences) document.sentences.foreach(predict(_)) // we have Sentence boundaries else predict(section.tokens) // we don't // TODO But if we have trained with Sentence boundaries, won't this hurt accuracy? } @@ -331,12 +321,12 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { BinarySerializer.deserialize(WordData.docWordCounts, dstream) dstream.close() // TODO Are we really supposed to close here, or is that the responsibility of the caller } - + def printAccuracy(sentences: Iterable[Sentence], extraText: String) = { val (tokAcc, senAcc, speed, _) = accuracy(sentences) println(extraText + s"$tokAcc token accuracy, $senAcc sentence accuracy, $speed tokens/sec") } - + def accuracy(sentences:Iterable[Sentence]): (Double, Double, Double, Double) = { var tokenTotal = 0.0 var tokenCorrect = 0.0 @@ -348,7 +338,7 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { val t0 = System.currentTimeMillis() process(s) //predict(s) totalTime += (System.currentTimeMillis()-t0) - for (token <- s.tokens) { + for (token <- s.tokens) { tokenTotal += 1 if (token.attr[LabeledPennPosTag].valueIsTarget) tokenCorrect += 1.0 else thisSentenceCorrect = 0.0 @@ -359,7 +349,7 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { val tokensPerSecond = (tokenTotal/totalTime)*1000.0 (tokenCorrect/tokenTotal, sentenceCorrect/sentenceTotal, tokensPerSecond, tokenTotal) } - + def test(sentences:Iterable[Sentence]) = { println("Testing on " + sentences.size + " sentences...") val (tokAccuracy, sentAccuracy, speed, tokens) = accuracy(sentences) @@ -367,25 +357,25 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { println("Token accuracy: " + tokAccuracy) println("Sentence accuracy: " + sentAccuracy) } - + def train(trainSentences:Seq[Sentence], testSentences:Seq[Sentence], lrate:Double = 0.1, decay:Double = 0.01, cutoff:Int = 2, doBootstrap:Boolean = true, useHingeLoss:Boolean = false, numIterations: Int = 5, l1Factor:Double = 0.000001, l2Factor:Double = 0.000001)(implicit random: scala.util.Random) { // TODO Accomplish this TokenNormalization instead by calling POS3.preProcess //for (sentence <- trainSentences ++ testSentences; token <- sentence.tokens) cc.factorie.app.nlp.segment.PlainTokenNormalizer.processToken(token) - + val toksPerDoc = 5000 WordData.computeWordFormsByDocumentFrequency(trainSentences.flatMap(_.tokens), 1, toksPerDoc) WordData.computeAmbiguityClasses(trainSentences.flatMap(_.tokens)) - + // Prune features by count FeatureDomain.dimensionDomain.gatherCounts = true for (sentence <- trainSentences) features(sentence.tokens) // just to create and count all features FeatureDomain.dimensionDomain.trimBelowCount(cutoff) FeatureDomain.freeze() println("After pruning using %d features.".format(FeatureDomain.dimensionDomain.size)) - + /* Print out some features (for debugging) */ //println("ForwardPosTagger.train\n"+trainSentences(3).tokens.map(_.string).zip(features(trainSentences(3).tokens).map(t => new FeatureVariable(t).toString)).mkString("\n")) - + def evaluate() { exampleSetsToPrediction = doBootstrap printAccuracy(trainSentences, "Training: ") @@ -414,169 +404,8 @@ class ForwardPosTagger extends DocumentAnnotator with Serializable { d } def process(s: Sentence) = { predict(s); s } - def prereqAttrs: Iterable[Class[_]] = List(classOf[Token], classOf[Sentence], 
classOf[segment.PlainNormalizedTokenString]) + def prereqAttrs: Iterable[Class[_]] = List(classOf[Token], classOf[Sentence], classOf[cc.factorie.app.nlp.segment.PlainNormalizedTokenString]) def postAttrs: Iterable[Class[_]] = List(classOf[PennPosTag]) override def tokenAnnotationString(token:Token): String = { val label = token.attr[PennPosTag]; if (label ne null) label.categoryValue else "(null)" } } -/** The default part-of-speech tagger, trained on Penn Treebank Wall Street Journal, with parameters loaded from resources in the classpath. */ -class WSJForwardPosTagger(url:java.net.URL) extends ForwardPosTagger(url) -object WSJForwardPosTagger extends WSJForwardPosTagger(cc.factorie.util.ClasspathURL[WSJForwardPosTagger](".factorie")) - -/** The default part-of-speech tagger, trained on all Ontonotes training data (including Wall Street Journal), with parameters loaded from resources in the classpath. */ -class OntonotesForwardPosTagger(url:java.net.URL) extends ForwardPosTagger(url) with Serializable -object OntonotesForwardPosTagger extends OntonotesForwardPosTagger(cc.factorie.util.ClasspathURL[OntonotesForwardPosTagger](".factorie")) with Serializable - -class ForwardPosOptions extends cc.factorie.util.DefaultCmdOptions with SharedNLPCmdOptions{ - val modelFile = new CmdOption("model", "", "FILENAME", "Filename for the model (saving a trained model or reading a running model.") - val testFile = new CmdOption("test-file", "", "FILENAME", "OWPL test file.") - val trainFile = new CmdOption("train-file", "", "FILENAME", "OWPL training file.") - val testDir = new CmdOption("test-dir", "", "FILENAME", "Directory containing OWPL test files (.dep.pmd).") - val trainDir = new CmdOption("train-dir", "", "FILENAME", "Directory containing OWPL training files (.dep.pmd).") - val testFiles = new CmdOption("test-files", "", "STRING", "comma-separated list of OWPL test files (.dep.pmd).") - val trainFiles = new CmdOption("train-files", "", "STRING", "comma-separated list of OWPL training files (.dep.pmd).") - val l1 = new CmdOption("l1", 0.000001, "FLOAT", "l1 regularization weight") - val l2 = new CmdOption("l2", 0.00001, "FLOAT", "l2 regularization weight") - val rate = new CmdOption("rate", 1.0, "FLOAT", "base learning rate") - val delta = new CmdOption("delta", 0.1, "FLOAT", "learning rate decay") - val cutoff = new CmdOption("cutoff", 2, "INT", "Discard features less frequent than this before training.") - val updateExamples = new CmdOption("update-examples", true, "BOOL", "Whether to update examples in later iterations during training.") - val useHingeLoss = new CmdOption("use-hinge-loss", false, "BOOL", "Whether to use hinge loss (or log loss) during training.") - val saveModel = new CmdOption("save-model", false, "BOOL", "Whether to save the trained model.") - val runText = new CmdOption("run", "", "FILENAME", "Plain text file on which to run.") - val numIters = new CmdOption("num-iterations", 5, "INT", "number of passes over the data for training") - val owpl = new CmdOption("owpl", false, "BOOL", "Whether the data is in OWPL format or otherwise (Ontonotes)") -} - -/* -object ForwardPosTester { - def main(args: Array[String]) { - val opts = new ForwardPosOptions - opts.parse(args) - assert(opts.testFile.wasInvoked || opts.testDir.wasInvoked || opts.testFiles.wasInvoked) - - // load model from file if given, - // else if the wsj command line param was specified use wsj model, - // otherwise ontonotes model - val pos = { - if(opts.modelFile.wasInvoked) new ForwardPosTagger(new 
File(opts.modelFile.value)) - else if(opts.owpl.value) WSJForwardPosTagger - else OntonotesForwardPosTagger - } - - assert(!(opts.testDir.wasInvoked && opts.testFiles.wasInvoked)) - var testFileList = Seq(opts.testFile.value) - if(opts.testDir.wasInvoked){ - testFileList = FileUtils.getFileListFromDir(opts.testDir.value) - }else if (opts.testFiles.wasInvoked){ - testFileList = opts.testFiles.value.split(",") - } - - val testPortionToTake = if(opts.testPortion.wasInvoked) opts.testPortion.value else 1.0 - val testDocs = testFileList.map(fname => { - if(opts.owpl.value) load.LoadOWPL.fromFilename(fname, pennPosLabelMaker).head - else load.LoadOntonotes5.fromFilename(fname).head - }) - val testSentencesFull = testDocs.flatMap(_.sentences) - val testSentences = testSentencesFull.take((testPortionToTake*testSentencesFull.length).floor.toInt) - - pos.test(testSentences) - } -} -*/ -/* -object ForwardPosTrainer extends HyperparameterMain { - def evaluateParameters(args: Array[String]): Double = { - implicit val random = new scala.util.Random(0) - val opts = new ForwardPosOptions - opts.parse(args) - assert(opts.trainFile.wasInvoked || opts.trainDir.wasInvoked || opts.trainFiles.wasInvoked) - // Expects three command-line arguments: a train file, a test file, and a place to save the model - // the train and test files are supposed to be in OWPL format - val pos = new ForwardPosTagger - - assert(!(opts.trainDir.wasInvoked && opts.trainFiles.wasInvoked)) - var trainFileList = Seq(opts.trainFile.value) - if(opts.trainDir.wasInvoked){ - trainFileList = FileUtils.getFileListFromDir(opts.trainDir.value) - } else if (opts.trainFiles.wasInvoked){ - trainFileList = opts.trainFiles.value.split(",") - } - - assert(!(opts.testDir.wasInvoked && opts.testFiles.wasInvoked)) - var testFileList = Seq(opts.testFile.value) - if(opts.testDir.wasInvoked){ - testFileList = FileUtils.getFileListFromDir(opts.testDir.value) - }else if (opts.testFiles.wasInvoked){ - testFileList = opts.testFiles.value.split(",") - } - - val trainDocs = trainFileList.map(fname => { - if(opts.owpl.value) load.LoadOWPL.fromFilename(fname, pennPosLabelMaker).head - else load.LoadOntonotes5.fromFilename(fname).head - }) - val testDocs = testFileList.map(fname => { - if(opts.owpl.value) load.LoadOWPL.fromFilename(fname, pennPosLabelMaker).head - else load.LoadOntonotes5.fromFilename(fname).head - }) - - //for (d <- trainDocs) println("POS3.train 1 trainDoc.length="+d.length) - println("Read %d training tokens from %d files.".format(trainDocs.map(_.tokenCount).sum, trainDocs.size)) - println("Read %d testing tokens from %d files.".format(testDocs.map(_.tokenCount).sum, testDocs.size)) - - val trainPortionToTake = if(opts.trainPortion.wasInvoked) opts.trainPortion.value else 1.0 - val testPortionToTake = if(opts.testPortion.wasInvoked) opts.testPortion.value else 1.0 - val trainSentencesFull = trainDocs.flatMap(_.sentences) - val trainSentences = trainSentencesFull.take((trainPortionToTake*trainSentencesFull.length).floor.toInt) - val testSentencesFull = testDocs.flatMap(_.sentences) - val testSentences = testSentencesFull.take((testPortionToTake*testSentencesFull.length).floor.toInt) - - pos.train(trainSentences, testSentences, - opts.rate.value, opts.delta.value, opts.cutoff.value, opts.updateExamples.value, opts.useHingeLoss.value, numIterations=opts.numIters.value.toInt,l1Factor=opts.l1.value, l2Factor=opts.l2.value) - if (opts.saveModel.value) { - pos.serialize(opts.modelFile.value) - val pos2 = new ForwardPosTagger - pos2.deserialize(new 
java.io.File(opts.modelFile.value)) - pos.printAccuracy(testDocs.flatMap(_.sentences), "pre-serialize accuracy: ") - pos2.printAccuracy(testDocs.flatMap(_.sentences), "post-serialize accuracy: ") - } - val acc = pos.accuracy(testDocs.flatMap(_.sentences))._1 - if(opts.targetAccuracy.wasInvoked) cc.factorie.assertMinimalAccuracy(acc,opts.targetAccuracy.value.toDouble) - acc - } -} -*/ -/* -object ForwardPosOptimizer { - def main(args: Array[String]) { - val opts = new ForwardPosOptions - opts.parse(args) - opts.saveModel.setValue(false) - val l1 = cc.factorie.util.HyperParameter(opts.l1, new cc.factorie.util.LogUniformDoubleSampler(1e-10, 1e2)) - val l2 = cc.factorie.util.HyperParameter(opts.l2, new cc.factorie.util.LogUniformDoubleSampler(1e-10, 1e2)) - val rate = cc.factorie.util.HyperParameter(opts.rate, new cc.factorie.util.LogUniformDoubleSampler(1e-4, 1e4)) - val delta = cc.factorie.util.HyperParameter(opts.delta, new cc.factorie.util.LogUniformDoubleSampler(1e-4, 1e4)) - val cutoff = cc.factorie.util.HyperParameter(opts.cutoff, new cc.factorie.util.SampleFromSeq(List(0,1,2,3))) - val iters = cc.factorie.util.HyperParameter(opts.numIters, new cc.factorie.util.SampleFromSeq(List(3,5,7))) - /* - val ssh = new cc.factorie.util.SSHActorExecutor("apassos", - Seq("avon1", "avon2"), - "/home/apassos/canvas/factorie-test", - "try-log/", - "cc.factorie.app.nlp.parse.DepParser2", - 10, 5) - */ - val qs = new cc.factorie.util.QSubExecutor(16, "cc.factorie.app.nlp.pos.ForwardPosTrainer") - val optimizer = new cc.factorie.util.HyperParameterSearcher(opts, Seq(l1, l2, rate, delta, cutoff, iters), qs.execute, 200, 180, 60) - val result = optimizer.optimize() - println("Got results: " + result.mkString(" ")) - println("Best l1: " + opts.l1.value + " best l2: " + opts.l2.value) - opts.saveModel.setValue(true) - println("Running best configuration...") - import scala.concurrent.Await - import scala.concurrent.duration._ - Await.result(qs.execute(opts.values.flatMap(_.unParse).toArray), 5.hours) - println("Done") - } -} -*/ diff --git a/src/main/scala/cc/factorie/app/nlp/pos/LabeledPennPosTag.scala b/src/main/scala/cc/factorie/app/nlp/pos/LabeledPennPosTag.scala new file mode 100644 index 0000000..7a1e487 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/pos/LabeledPennPosTag.scala @@ -0,0 +1,8 @@ +package cc.factorie.app.nlp.pos + +import cc.factorie.app.nlp.Token +import cc.factorie.variable.CategoricalLabeling + +/** A categorical variable, associated with a token, holding its Penn Treebank part-of-speech category, + * which also separately holds its desired correct "target" value. */ +class LabeledPennPosTag(token:Token, targetValue:String) extends PennPosTag(token, targetValue) with CategoricalLabeling[String] with Serializable diff --git a/src/main/scala/cc/factorie/app/nlp/pos/LabeledUniversalPosTag.scala b/src/main/scala/cc/factorie/app/nlp/pos/LabeledUniversalPosTag.scala new file mode 100644 index 0000000..299d085 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/pos/LabeledUniversalPosTag.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.pos + +import cc.factorie.app.nlp.Token +import cc.factorie.variable.CategoricalLabeling + +/** A categorical variable, associated with a token, holding its Google Universal part-of-speech category, + * which also separately holds its desired correct "target" value. 
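As a quick illustration of the labeled tag class added above, the sketch below attaches a gold LabeledPennPosTag to a hand-built token and then simulates a prediction. The word and tags are invented, and the Document/Sentence/Token constructors simply mirror the ones used elsewhere in this patch; this is a sketch, not part of the change itself.

import cc.factorie.app.nlp.{Document, Sentence, Token}
import cc.factorie.app.nlp.pos.{LabeledPennPosTag, PennPosDomain}

object LabeledPennPosTagSketch {
  def main(args: Array[String]): Unit = {
    val doc = new Document              // empty document; each Token appends its own string
    val sentence = new Sentence(doc)
    val token = new Token(sentence, "dogs")

    // The labeled variant remembers its gold ("target") category for training and evaluation.
    val gold = new LabeledPennPosTag(token, "NNS")
    token.attr += gold
    println(gold.valueIsTarget)         // true: still at the target value

    gold.setCategory("NN")(null)        // simulate a (wrong) prediction
    println(gold.categoryValue)         // "NN"
    println(gold.valueIsTarget)         // false: differs from the "NNS" target
    println(PennPosDomain.isNoun(gold.categoryValue)) // true: both NN and NNS are nouns
  }
}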
*/ +class LabeledUniversalPosTag(token:Token, targetValue:String) extends UniversalPosTag(token, targetValue) with CategoricalLabeling[String] + diff --git a/src/main/scala/cc/factorie/app/nlp/pos/OntoNotesForwardPosTagger.scala b/src/main/scala/cc/factorie/app/nlp/pos/OntoNotesForwardPosTagger.scala new file mode 100644 index 0000000..b2ea55f --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/pos/OntoNotesForwardPosTagger.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.pos + +import java.io.Serializable + + + +/** The default part-of-speech tagger, trained on all Ontonotes training data (including Wall Street Journal), with parameters loaded from resources in the classpath. */ +class OntonotesForwardPosTagger(url:java.net.URL) extends ForwardPosTagger(url) with Serializable +object OntonotesForwardPosTagger extends OntonotesForwardPosTagger(cc.factorie.util.ClasspathURL[OntonotesForwardPosTagger](".factorie")) with Serializable diff --git a/src/main/scala/cc/factorie/app/nlp/pos/PennPosDomain.scala b/src/main/scala/cc/factorie/app/nlp/pos/PennPosDomain.scala new file mode 100644 index 0000000..b17fac2 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/pos/PennPosDomain.scala @@ -0,0 +1,76 @@ +package cc.factorie.app.nlp.pos + +import cc.factorie.variable.CategoricalDomain + +/** Penn Treebank part-of-speech tag domain. */ +object PennPosDomain extends CategoricalDomain[String] { + this ++= Vector( + "#", // In WSJ but not in Ontonotes + "$", + "''", + ",", + "-LRB-", + "-RRB-", + ".", + ":", + "CC", + "CD", + "DT", + "EX", + "FW", + "IN", + "JJ", + "JJR", + "JJS", + "LS", + "MD", + "NN", + "NNP", + "NNPS", + "NNS", + "PDT", + "POS", + "PRP", + "PRP$", + "PUNC", + "RB", + "RBR", + "RBS", + "RP", + "SYM", + "TO", + "UH", + "VB", + "VBD", + "VBG", + "VBN", + "VBP", + "VBZ", + "WDT", + "WP", + "WP$", + "WRB", + "``", + "ADD", // in Ontonotes, but not WSJ + "AFX", // in Ontonotes, but not WSJ + "HYPH", // in Ontonotes, but not WSJ + "NFP", // in Ontonotes, but not WSJ + "XX" // in Ontonotes, but not WSJ + ) + freeze() + // Short-cuts for a few commonly-queried tags + val posIndex = index("POS") + val nnpIndex = index("NNP") + val nnpsIndex = index("NNPS") + val prpIndex = index("PRP") + val prpdIndex = index("PRP$") + val wpIndex = index("WP") + val wpdIndex = index("WP$") + val ccIndex = index("CC") + + def isNoun(pos:String): Boolean = pos(0) == 'N' + def isProperNoun(pos:String) = { pos == "NNP" || pos == "NNPS" } + def isVerb(pos:String) = pos(0) == 'V' + def isAdjective(pos:String) = pos(0) == 'J' + def isPersonalPronoun(pos: String) = pos == "PRP" +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/pos/PennPosTag.scala b/src/main/scala/cc/factorie/app/nlp/pos/PennPosTag.scala new file mode 100644 index 0000000..28ac18a --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/pos/PennPosTag.scala @@ -0,0 +1,16 @@ +package cc.factorie.app.nlp.pos + +import cc.factorie.app.nlp.Token + +/** A categorical variable, associated with a token, holding its Penn Treebank part-of-speech category. 
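For orientation, here is a minimal usage sketch for the pretrained tagger object defined above. It assumes the OntonotesForwardPosTagger ".factorie" model resource is on the classpath (otherwise the object fails to initialise), and it hand-builds the Token and Sentence annotations that prereqAttrs expects; in a real pipeline a tokenizer and sentence segmenter would provide these, along with the normalized token strings, so the raw strings here only stand in for them. The example sentence is invented.

import cc.factorie.app.nlp.{Document, Sentence, Token}
import cc.factorie.app.nlp.pos.{OntonotesForwardPosTagger, PennPosTag}

object ForwardPosTaggerSketch {
  def main(args: Array[String]): Unit = {
    val doc = new Document
    val sentence = new Sentence(doc)
    Seq("Dogs", "bark", ".").foreach(w => new Token(sentence, w))

    // Greedily tag the sentence left to right with the pretrained model.
    OntonotesForwardPosTagger.process(sentence)

    sentence.tokens.foreach { t =>
      val tag = t.attr[PennPosTag]
      println(t.string + "/" + tag.categoryValue + (if (tag.isNoun) " (noun)" else ""))
    }
  }
}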
*/ +class PennPosTag(token:Token, initialIndex:Int) + extends PosTag(token, initialIndex) with Serializable { + def this(token:Token, initialCategory:String) = this(token, PennPosDomain.index(initialCategory)) + final def domain = PennPosDomain + def isNoun = PennPosDomain.isNoun(categoryValue) + def isProperNoun = PennPosDomain.isProperNoun(categoryValue) + def isVerb = PennPosDomain.isVerb(categoryValue) + def isAdjective = PennPosDomain.isAdjective(categoryValue) + def isPersonalPronoun = PennPosDomain.isPersonalPronoun(categoryValue) +} + diff --git a/src/main/scala/cc/factorie/app/nlp/pos/PosTag.scala b/src/main/scala/cc/factorie/app/nlp/pos/PosTag.scala index d2de50d..99704f0 100644 --- a/src/main/scala/cc/factorie/app/nlp/pos/PosTag.scala +++ b/src/main/scala/cc/factorie/app/nlp/pos/PosTag.scala @@ -1,251 +1,10 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp.pos -import cc.factorie.app.nlp._ -import cc.factorie.variable._ - -abstract class PosTag(val token:Token, initialIndex:Int) extends CategoricalVariable[String](initialIndex) - -/** Penn Treebank part-of-speech tag domain. */ -object PennPosDomain extends CategoricalDomain[String] { - this ++= Vector( - "#", // In WSJ but not in Ontonotes - "$", - "''", - ",", - "-LRB-", - "-RRB-", - ".", - ":", - "CC", - "CD", - "DT", - "EX", - "FW", - "IN", - "JJ", - "JJR", - "JJS", - "LS", - "MD", - "NN", - "NNP", - "NNPS", - "NNS", - "PDT", - "POS", - "PRP", - "PRP$", - "PUNC", - "RB", - "RBR", - "RBS", - "RP", - "SYM", - "TO", - "UH", - "VB", - "VBD", - "VBG", - "VBN", - "VBP", - "VBZ", - "WDT", - "WP", - "WP$", - "WRB", - "``", - "ADD", // in Ontonotes, but not WSJ - "AFX", // in Ontonotes, but not WSJ - "HYPH", // in Ontonotes, but not WSJ - "NFP", // in Ontonotes, but not WSJ - "XX" // in Ontonotes, but not WSJ - ) - freeze() - // Short-cuts for a few commonly-queried tags - val posIndex = index("POS") - val nnpIndex = index("NNP") - val nnpsIndex = index("NNPS") - val prpIndex = index("PRP") - val prpdIndex = index("PRP$") - val wpIndex = index("WP") - val wpdIndex = index("WP$") - val ccIndex = index("CC") - def isNoun(pos:String): Boolean = pos(0) == 'N' - def isProperNoun(pos:String) = { pos == "NNP" || pos == "NNPS" } - def isVerb(pos:String) = pos(0) == 'V' - def isAdjective(pos:String) = pos(0) == 'J' - def isPersonalPronoun(pos: String) = pos == "PRP" -} -/** A categorical variable, associated with a token, holding its Penn Treebank part-of-speech category. 
*/ -class PennPosTag(token:Token, initialIndex:Int) - extends PosTag(token, initialIndex) with Serializable { - def this(token:Token, initialCategory:String) = this(token, PennPosDomain.index(initialCategory)) - final def domain = PennPosDomain - def isNoun = PennPosDomain.isNoun(categoryValue) - def isProperNoun = PennPosDomain.isProperNoun(categoryValue) - def isVerb = PennPosDomain.isVerb(categoryValue) - def isAdjective = PennPosDomain.isAdjective(categoryValue) - def isPersonalPronoun = PennPosDomain.isPersonalPronoun(categoryValue) -} -/** A categorical variable, associated with a token, holding its Penn Treebank part-of-speech category, - which also separately holds its desired correct "target" value. */ -class LabeledPennPosTag(token:Token, targetValue:String) - extends PennPosTag(token, targetValue) with CategoricalLabeling[String] with Serializable +import cc.factorie.app.nlp.Token +import cc.factorie.variable.CategoricalVariable - -/** The "A Universal Part-of-Speech Tagset" - by Slav Petrov, Dipanjan Das and Ryan McDonald - http://arxiv.org/abs/1104.2086 - http://code.google.com/p/universal-pos-tags - - VERB - verbs (all tenses and modes) - NOUN - nouns (common and proper) - PRON - pronouns - ADJ - adjectives - ADV - adverbs - ADP - adpositions (prepositions and postpositions) - CONJ - conjunctions - DET - determiners - NUM - cardinal numbers - PRT - particles or other function words - X - other: foreign words, typos, abbreviations - . - punctuation +/** + * Created by andrew@andrewresearch.net on 27/10/17. */ -object UniversalPosDomain extends EnumDomain { - this ++= Vector("VERB", "NOUN", "PRON", "ADJ", "ADV", "ADP", "CONJ", "DET", "NUM", "PRT", "X", ".") - freeze() - private val Penn2universal = new scala.collection.mutable.HashMap[String,String] ++= Vector( - "!" -> ".", - "#" -> ".", - "$" -> ".", - "''" -> ".", - "(" -> ".", - ")" -> ".", - "," -> ".", - "-LRB-" -> ".", - "-RRB-" -> ".", - "." -> ".", - ":" -> ".", - "?" -> ".", - "CC" -> "CONJ", - "CD" -> "NUM", - "CD|RB" -> "X", - "DT" -> "DET", - "EX"-> "DET", - "FW" -> "X", - "IN" -> "ADP", - "IN|RP" -> "ADP", - "JJ" -> "ADJ", - "JJR" -> "ADJ", - "JJRJR" -> "ADJ", - "JJS" -> "ADJ", - "JJ|RB" -> "ADJ", - "JJ|VBG" -> "ADJ", - "LS" -> "X", - "MD" -> "VERB", - "NN" -> "NOUN", - "NNP" -> "NOUN", - "NNPS" -> "NOUN", - "NNS" -> "NOUN", - "NN|NNS" -> "NOUN", - "NN|SYM" -> "NOUN", - "NN|VBG" -> "NOUN", - "NP" -> "NOUN", - "PDT" -> "DET", - "POS" -> "PRT", - "PRP" -> "PRON", - "PRP$" -> "PRON", - "PRP|VBP" -> "PRON", - "PRT" -> "PRT", - "RB" -> "ADV", - "RBR" -> "ADV", - "RBS" -> "ADV", - "RB|RP" -> "ADV", - "RB|VBG" -> "ADV", - "RN" -> "X", - "RP" -> "PRT", - "SYM" -> "X", - "TO" -> "PRT", - "UH" -> "X", - "VB" -> "VERB", - "VBD" -> "VERB", - "VBD|VBN" -> "VERB", - "VBG" -> "VERB", - "VBG|NN" -> "VERB", - "VBN" -> "VERB", - "VBP" -> "VERB", - "VBP|TO" -> "VERB", - "VBZ" -> "VERB", - "VP" -> "VERB", - "WDT" -> "DET", - "WH" -> "X", - "WP" -> "PRON", - "WP$" -> "PRON", - "WRB" -> "ADV", - "``" -> ".") - def categoryFromPenn(PennPosCategory:String): String = Penn2universal(PennPosCategory) -} - -/** A categorical variable, associated with a token, holding its Google Universal part-of-speech category. 
*/ -class UniversalPosTag(val token:Token, initialValue:String) extends CategoricalVariable(initialValue) { - def this(token:Token, other:PennPosTag) = this(token, UniversalPosDomain.categoryFromPenn(other.categoryValue)) - def domain = UniversalPosDomain -} -/** A categorical variable, associated with a token, holding its Google Universal part-of-speech category, - which also separately holds its desired correct "target" value. */ -class LabeledUniversalPosTag(token:Token, targetValue:String) extends UniversalPosTag(token, targetValue) with CategoricalLabeling[String] - - -/** Penn Treebank part-of-speech tag domain. */ -object SpanishPosDomain extends CategoricalDomain[String] { - this ++= Vector( - "a", // adjective - "c", // conjunction - "d", // determiner - "f", // punctuation - "i", // interjection - "n", // noun - "p", // pronoun - "r", // adverb - "s", // preposition - "v", // verb - "w", // date - "z", // number - "_" // unknown - ) - freeze() - - def isNoun(pos:String): Boolean = pos(0) == 'n' -// def isProperNoun(pos:String) = { pos == "NNP" || pos == "NNPS" } - def isVerb(pos:String) = pos(0) == 'v' - def isAdjective(pos:String) = pos(0) == 'a' -// def isPersonalPronoun(pos: String) = pos == "PRP" -} -/** A categorical variable, associated with a token, holding its Penn Treebank part-of-speech category. */ -class SpanishPosTag(token:Token, initialIndex:Int) extends PosTag(token, initialIndex) { - def this(token:Token, initialCategory:String) = this(token, SpanishPosDomain.index(initialCategory)) - final def domain = SpanishPosDomain - def isNoun = SpanishPosDomain.isNoun(categoryValue) -// def isProperNoun = SpanishPosDomain.isProperNoun(categoryValue) - def isVerb = SpanishPosDomain.isVerb(categoryValue) - def isAdjective = SpanishPosDomain.isAdjective(categoryValue) -// def isPersonalPronoun = SpanishPosDomain.isPersonalPronoun(categoryValue) -} - -/** A categorical variable, associated with a token, holding its Spanish Treebank part-of-speech category, - which also separately holds its desired correct "target" value. */ -class LabeledSpanishPosTag(token:Token, targetValue:String) extends SpanishPosTag(token, targetValue) with CategoricalLabeling[String] +abstract class PosTag(val token:Token, initialIndex:Int) extends CategoricalVariable[String](initialIndex) \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/pos/UniversalPosDomain.scala b/src/main/scala/cc/factorie/app/nlp/pos/UniversalPosDomain.scala new file mode 100644 index 0000000..099198e --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/pos/UniversalPosDomain.scala @@ -0,0 +1,97 @@ +package cc.factorie.app.nlp.pos + +import cc.factorie.variable.EnumDomain + + +/** The "A Universal Part-of-Speech Tagset" + * by Slav Petrov, Dipanjan Das and Ryan McDonald + * http://arxiv.org/abs/1104.2086 + * http://code.google.com/p/universal-pos-tags + ** + *VERB - verbs (all tenses and modes) + *NOUN - nouns (common and proper) + *PRON - pronouns + *ADJ - adjectives + *ADV - adverbs + *ADP - adpositions (prepositions and postpositions) + *CONJ - conjunctions + *DET - determiners + *NUM - cardinal numbers + *PRT - particles or other function words + *X - other: foreign words, typos, abbreviations + *. - punctuation + */ +object UniversalPosDomain extends EnumDomain { + this ++= Vector("VERB", "NOUN", "PRON", "ADJ", "ADV", "ADP", "CONJ", "DET", "NUM", "PRT", "X", ".") + freeze() + private val Penn2universal = new scala.collection.mutable.HashMap[String,String] ++= Vector( + "!" 
-> ".", + "#" -> ".", + "$" -> ".", + "''" -> ".", + "(" -> ".", + ")" -> ".", + "," -> ".", + "-LRB-" -> ".", + "-RRB-" -> ".", + "." -> ".", + ":" -> ".", + "?" -> ".", + "CC" -> "CONJ", + "CD" -> "NUM", + "CD|RB" -> "X", + "DT" -> "DET", + "EX"-> "DET", + "FW" -> "X", + "IN" -> "ADP", + "IN|RP" -> "ADP", + "JJ" -> "ADJ", + "JJR" -> "ADJ", + "JJRJR" -> "ADJ", + "JJS" -> "ADJ", + "JJ|RB" -> "ADJ", + "JJ|VBG" -> "ADJ", + "LS" -> "X", + "MD" -> "VERB", + "NN" -> "NOUN", + "NNP" -> "NOUN", + "NNPS" -> "NOUN", + "NNS" -> "NOUN", + "NN|NNS" -> "NOUN", + "NN|SYM" -> "NOUN", + "NN|VBG" -> "NOUN", + "NP" -> "NOUN", + "PDT" -> "DET", + "POS" -> "PRT", + "PRP" -> "PRON", + "PRP$" -> "PRON", + "PRP|VBP" -> "PRON", + "PRT" -> "PRT", + "RB" -> "ADV", + "RBR" -> "ADV", + "RBS" -> "ADV", + "RB|RP" -> "ADV", + "RB|VBG" -> "ADV", + "RN" -> "X", + "RP" -> "PRT", + "SYM" -> "X", + "TO" -> "PRT", + "UH" -> "X", + "VB" -> "VERB", + "VBD" -> "VERB", + "VBD|VBN" -> "VERB", + "VBG" -> "VERB", + "VBG|NN" -> "VERB", + "VBN" -> "VERB", + "VBP" -> "VERB", + "VBP|TO" -> "VERB", + "VBZ" -> "VERB", + "VP" -> "VERB", + "WDT" -> "DET", + "WH" -> "X", + "WP" -> "PRON", + "WP$" -> "PRON", + "WRB" -> "ADV", + "``" -> ".") + def categoryFromPenn(PennPosCategory:String): String = Penn2universal(PennPosCategory) +} diff --git a/src/main/scala/cc/factorie/app/nlp/pos/UniversalPosTag.scala b/src/main/scala/cc/factorie/app/nlp/pos/UniversalPosTag.scala new file mode 100644 index 0000000..ee1af6e --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/pos/UniversalPosTag.scala @@ -0,0 +1,10 @@ +package cc.factorie.app.nlp.pos + +import cc.factorie.app.nlp.Token +import cc.factorie.variable.CategoricalVariable + +/** A categorical variable, associated with a token, holding its Google Universal part-of-speech category. */ +class UniversalPosTag(val token:Token, initialValue:String) extends CategoricalVariable(initialValue) { + def this(token:Token, other:PennPosTag) = this(token, UniversalPosDomain.categoryFromPenn(other.categoryValue)) + def domain = UniversalPosDomain +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/pos/package.scala b/src/main/scala/cc/factorie/app/nlp/pos/package.scala index 5a4a8bb..89e52b3 100644 --- a/src/main/scala/cc/factorie/app/nlp/pos/package.scala +++ b/src/main/scala/cc/factorie/app/nlp/pos/package.scala @@ -10,8 +10,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -package cc.factorie.app.nlp +package cc.factorie.app +import cc.factorie.app.nlp.Token +import cc.factorie.app.nlp.pos.LabeledPennPosTag import cc.factorie.variable.MutableCategoricalVar package object pos { diff --git a/src/main/scala/cc/factorie/app/nlp/relation/ConllPatternBasedRelationFinder.scala b/src/main/scala/cc/factorie/app/nlp/relation/ConllPatternBasedRelationFinder.scala new file mode 100644 index 0000000..7f40839 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/relation/ConllPatternBasedRelationFinder.scala @@ -0,0 +1,3 @@ +package cc.factorie.app.nlp.relation + +object ConllPatternBasedRelationFinder extends PatternBasedRelationFinder(PatternRelationPredictor.predictorsFromStreams(getClass.getResourceAsStream("/cc/factorie/app/nlp/relation/patterns.tuned"), getClass.getResourceAsStream("/cc/factorie/app/nlp/relation/argtypes_conll"))) diff --git a/src/main/scala/cc/factorie/app/nlp/relation/OntoNotesPatternBasedRelationFinder.scala b/src/main/scala/cc/factorie/app/nlp/relation/OntoNotesPatternBasedRelationFinder.scala new file mode 100644 index 0000000..29dcc64 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/relation/OntoNotesPatternBasedRelationFinder.scala @@ -0,0 +1,8 @@ +package cc.factorie.app.nlp.relation + +/** + * Created by andrew@andrewresearch.net on 28/10/17. + */ + +object OntoNotesPatternBasedRelationFinder extends PatternBasedRelationFinder(PatternRelationPredictor.predictorsFromStreams(getClass.getResourceAsStream("/cc/factorie/app/nlp/relation/patterns.tuned"), getClass.getResourceAsStream("/cc/factorie/app/nlp/relation/argtypes_ontonotes"))) + diff --git a/src/main/scala/cc/factorie/app/nlp/relation/PatterRelationPredictor.scala b/src/main/scala/cc/factorie/app/nlp/relation/PatterRelationPredictor.scala new file mode 100644 index 0000000..dc90c42 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/relation/PatterRelationPredictor.scala @@ -0,0 +1,47 @@ +package cc.factorie.app.nlp.relation + +import java.io.InputStream + +import scala.io.Source + +case class PatternRelationPredictor(relation : String, patternConfidences : Map[String, Double], qTypes : Set[String], + sTypes : Set[String]) { + + val ARG1 = "$ARG1" + val ARG2 = "$ARG2" + + + /** The first boolean indicates if the relation holds in the forward direction (arg1 first) the second if it holds in the reverse */ + def relationMatch(rm : RelationMention) : Double = { + val arg1End = rm.arg1.phrase.last.positionInSentence + val arg2Start = rm.arg2.phrase.head.positionInSentence + + + val forwardPattern = ARG1 + " " + rm.arg1.phrase.sentence.slice(arg1End + 1, arg2Start).map(_.string).mkString(" ") + " " + ARG2 + val backwardPattern = ARG2 + " " + rm.arg1.phrase.sentence.slice(arg1End + 1, arg2Start).map(_.string).mkString(" ") + " " + ARG1 + + val pattern = if(rm.isArg1First) forwardPattern else backwardPattern + + val arg1Type = rm.arg1.phrase.head.nerTag.baseCategoryValue + val arg2Type = rm.arg2.phrase.head.nerTag.baseCategoryValue + val hasMatch = qTypes.contains(arg1Type) && sTypes.contains(arg2Type) && patternConfidences.contains(pattern) + if(hasMatch) patternConfidences(pattern) else 0.0 + } +} + +object PatternRelationPredictor { + def predictorsFromStreams(patternStream:InputStream, typeFileStream:InputStream):Seq[PatternRelationPredictor] = { + + val relToPats = Source.fromInputStream(patternStream, "UTF8").getLines.map(_.stripLineEnd.split(" ", 3)). 
+ map(fields => fields(1) -> (fields(2), fields(0).toDouble)).toList.groupBy(_._1).map { case (k,v) => (k,v.map(_._2).toMap)} + + // reads types from a white-space & comma-separted file of the form: + // relation arg1type,arg1type... arg2type,arg2type + // Types of ontonotes domain described here: http://catalog.ldc.upenn.edu/docs/LDC2008T04/OntoNotes-Release-2.0.pdf + val relToTypes = Source.fromInputStream(typeFileStream, "UTF8").getLines.map(_.stripLineEnd.split(" ", 3)). + map(fields => fields(0) -> (fields(1).split(',').toSet, fields(2).split(',').toSet)).toList + for ((rel, (arg1types, arg2types)) <- relToTypes) yield + new PatternRelationPredictor(rel, relToPats.getOrElse(rel, Map.empty[String, Double]), arg1types, arg2types) + } +} + diff --git a/src/main/scala/cc/factorie/app/nlp/relation/PatternBasedRelationFinder.scala b/src/main/scala/cc/factorie/app/nlp/relation/PatternBasedRelationFinder.scala index e5290ff..044b66a 100644 --- a/src/main/scala/cc/factorie/app/nlp/relation/PatternBasedRelationFinder.scala +++ b/src/main/scala/cc/factorie/app/nlp/relation/PatternBasedRelationFinder.scala @@ -1,27 +1,11 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
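To make the expected input formats concrete, here is a sketch that builds predictors from two tiny in-memory "files". The relation name, pattern strings, NER types and confidences are invented for illustration; only the line layout (confidence, relation, pattern on one file, and relation, arg1 types, arg2 types on the other) follows the parsing code above.

import java.io.ByteArrayInputStream
import cc.factorie.app.nlp.relation.PatternRelationPredictor

object PatternPredictorSketch {
  def main(args: Array[String]): Unit = {
    // Pattern file: one "<confidence> <relation> <pattern>" triple per line.
    val patterns =
      "0.9 per:employee_of $ARG1 works for $ARG2\n" +
      "0.7 per:employee_of $ARG1 , an employee of $ARG2 ,"
    // Type file: "<relation> <arg1-NER-types,comma-separated> <arg2-NER-types,comma-separated>".
    val types = "per:employee_of PERSON ORG"

    val predictors = PatternRelationPredictor.predictorsFromStreams(
      new ByteArrayInputStream(patterns.getBytes("UTF-8")),
      new ByteArrayInputStream(types.getBytes("UTF-8")))

    predictors.foreach(p =>
      println(p.relation + ": " + p.patternConfidences.size + " patterns, qTypes=" + p.qTypes))
  }
}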
*/ package cc.factorie.app.nlp.relation -import java.io.InputStream - -import cc.factorie.app.nlp._ import cc.factorie.app.nlp.coref.{ParseForwardCoref, WithinDocCoref} - -import scala.io.Source +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token, TokenSpan} /** - * @author John Sullivan, Benjamin Roth - */ + * @author John Sullivan, Benjamin Roth + */ class PatternBasedRelationFinder(predictors:Seq[PatternRelationPredictor]) extends DocumentAnnotator{ def tokenAnnotationString(token: Token) = null @@ -41,9 +25,9 @@ class PatternBasedRelationFinder(predictors:Seq[PatternRelationPredictor]) exten val mentionGrouping = (0 until mentions.size).map(idx => mentions.slice(idx, math.min(idx + 4, mentions.size))).dropRight(1).toList val relationMentions = (for(m1 :: ms <- mentionGrouping; - m2 <- ms; - if ((m1.phrase.sentence eq m2.phrase.sentence) && (m1.phrase.sentence.length < 100))) - yield {Seq(new RelationMention(m1, m2, true), new RelationMention(m2, m1, false))}).flatten + m2 <- ms; + if ((m1.phrase.sentence eq m2.phrase.sentence) && (m1.phrase.sentence.length < 100))) + yield {Seq(new RelationMention(m1, m2, true), new RelationMention(m2, m1, false))}).flatten for (rm <- relationMentions; predictor <- predictors; @@ -57,49 +41,4 @@ class PatternBasedRelationFinder(predictors:Seq[PatternRelationPredictor]) exten doc.attr += relSet doc } -} - -object OntoNotesPatternBasedRelationFinder extends PatternBasedRelationFinder(PatternRelationPredictor.predictorsFromStreams(getClass.getResourceAsStream("/cc/factorie/app/nlp/relation/patterns.tuned"), getClass.getResourceAsStream("/cc/factorie/app/nlp/relation/argtypes_ontonotes"))) -object ConllPatternBasedRelationFinder extends PatternBasedRelationFinder(PatternRelationPredictor.predictorsFromStreams(getClass.getResourceAsStream("/cc/factorie/app/nlp/relation/patterns.tuned"), getClass.getResourceAsStream("/cc/factorie/app/nlp/relation/argtypes_conll"))) - - -case class PatternRelationPredictor(relation : String, patternConfidences : Map[String, Double], qTypes : Set[String], - sTypes : Set[String]) { - - val ARG1 = "$ARG1" - val ARG2 = "$ARG2" - - - /** The first boolean indicates if the relation holds in the forward direction (arg1 first) the second if it holds in the reverse */ - def relationMatch(rm : RelationMention) : Double = { - val arg1End = rm.arg1.phrase.last.positionInSentence - val arg2Start = rm.arg2.phrase.head.positionInSentence - - - val forwardPattern = ARG1 + " " + rm.arg1.phrase.sentence.slice(arg1End + 1, arg2Start).map(_.string).mkString(" ") + " " + ARG2 - val backwardPattern = ARG2 + " " + rm.arg1.phrase.sentence.slice(arg1End + 1, arg2Start).map(_.string).mkString(" ") + " " + ARG1 - - val pattern = if(rm.isArg1First) forwardPattern else backwardPattern - - val arg1Type = rm.arg1.phrase.head.nerTag.baseCategoryValue - val arg2Type = rm.arg2.phrase.head.nerTag.baseCategoryValue - val hasMatch = qTypes.contains(arg1Type) && sTypes.contains(arg2Type) && patternConfidences.contains(pattern) - if(hasMatch) patternConfidences(pattern) else 0.0 - } -} - -object PatternRelationPredictor { - def predictorsFromStreams(patternStream:InputStream, typeFileStream:InputStream):Seq[PatternRelationPredictor] = { - - val relToPats = Source.fromInputStream(patternStream, "UTF8").getLines.map(_.stripLineEnd.split(" ", 3)). 
- map(fields => fields(1) -> (fields(2), fields(0).toDouble)).toList.groupBy(_._1).map { case (k,v) => (k,v.map(_._2).toMap)} - - // reads types from a white-space & comma-separted file of the form: - // relation arg1type,arg1type... arg2type,arg2type - // Types of ontonotes domain described here: http://catalog.ldc.upenn.edu/docs/LDC2008T04/OntoNotes-Release-2.0.pdf - val relToTypes = Source.fromInputStream(typeFileStream, "UTF8").getLines.map(_.stripLineEnd.split(" ", 3)). - map(fields => fields(0) -> (fields(1).split(',').toSet, fields(2).split(',').toSet)).toList - for ((rel, (arg1types, arg2types)) <- relToTypes) yield - new PatternRelationPredictor(rel, relToPats.getOrElse(rel, Map.empty[String, Double]), arg1types, arg2types) - } -} +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/relation/Relation.scala b/src/main/scala/cc/factorie/app/nlp/relation/Relation.scala deleted file mode 100644 index a41b242..0000000 --- a/src/main/scala/cc/factorie/app/nlp/relation/Relation.scala +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.relation - -import java.io.FileInputStream - -import cc.factorie.app.nlp.coref.ParseForwardCoref -import cc.factorie.app.nlp.load.LoadOWPL -import cc.factorie.app.nlp.ner.{NerTag, NoEmbeddingsConllStackedChainNer} -import cc.factorie.app.nlp.parse.OntonotesTransitionBasedParser -import cc.factorie.app.nlp.phrase.Phrase -import cc.factorie.app.nlp.pos.OntonotesForwardPosTagger -import cc.factorie.app.nlp.{Document, DocumentAnnotatorPipeline, Token, TokenSpan} -import cc.factorie.variable.{CategoricalDomain, MutableCategoricalVar} - -import scala.collection.mutable -import scala.io.Source - -/** - * @author John Sullivan - */ -/* -object Relation { - - def main(args:Array[String]) { - - val doc = new Document(Source.fromFile(args(0)).getLines().mkString("\n")).setName(args(0).split("""/""").last) - - val relFinder = if(args.length >= 3) { - new PatternBasedRelationFinder(PatternRelationPredictor.predictorsFromStreams(new FileInputStream(args(1)), new FileInputStream(args(2)))) - } else { - ConllPatternBasedRelationFinder - } - - val pipelineElements = Seq( - OntonotesForwardPosTagger, - NoEmbeddingsConllStackedChainNer, - OntonotesTransitionBasedParser, - ParseForwardCoref, - relFinder - ) - val annoMap = DocumentAnnotatorPipeline.defaultDocumentAnnotationMap.toMap ++ Seq(classOf[RelationMentionSeq] -> (() => relFinder)) - val pipeline = DocumentAnnotatorPipeline(annoMap, Nil, pipelineElements.flatMap(_.postAttrs)) - println("loaded document") - pipeline process doc - println("processed pipeline") - val relMentions = doc.attr[RelationMentionSeq].value - - println("Detected Mentions: ") - doc.coref.mentions.foreach { mention => - println(mention.phrase.string + " with type " + mention.phrase.head.nerTag.baseCategoryValue + " in sentence " + mention.phrase.sentence.string) - } - - println("writing mentions") - relMentions.foreach { rm => - rm.relations.value.foreach { relation => - if(rm.isArg1First) { - println(rm.arg1.string + " " + relation.value + " " + rm.arg2.string + " %.4f ".format(relation.confidence) + relation.provenance) - } else { - println(rm.arg2.string + " " + relation.value + " " + rm.arg1.string + " %.4f ".format(relation.confidence) + relation.provenance) - } - } - } - } -} -*/ - -object TACNerDomain extends CategoricalDomain[String] { - this ++= "O ORG GPE_CITY GPE_COUNTRY GPE_STATE DATE PERSON CARDINAL AFFILIATION PERSON WEBSITE CAUSE_OF_DEATH LAW RELIGION TITLE".split(' ') - freeze() -} - -class TACNerTag(token:Token, initialCategory:String) extends NerTag(token, initialCategory) { - def domain = TACNerDomain -} -/* -object GoldRelation { - - def annotate(t:Token, annos:Seq[String]):Seq[MutableCategoricalVar[String]] = { - annos.headOption.map(a => new TACNerTag(t, a)).toSeq - } - - def main (args:Array[String]) { - - val relFinder = if(args.length >= 3) { - new PatternBasedRelationFinder(PatternRelationPredictor.predictorsFromStreams(new FileInputStream(args(1)), new FileInputStream(args(2)))) - } else { - ConllPatternBasedRelationFinder - } - - val doc = LoadOWPL.fromFilename(args(0), annotate).head - - val coref = doc.getCoref - - var tokens = mutable.ArrayBuffer[Token]() - val iter = doc.tokens.iterator - - while(iter.hasNext) { - val t = iter.next() - println("PRocessing: " + t.string) - if(t.nerTag.baseCategoryValue != "O") { - tokens.append(t) - } else if (tokens.length > 0) { - val ts = new TokenSpan(tokens) - println("adding mention: " + ts.string) - coref.addMention(new Phrase(ts)) - tokens.clear() - } - } - if(tokens.length > 0) 
{ - val ts = new TokenSpan(tokens) - println("adding mention: " + ts.string) - coref.addMention(new Phrase(ts)) - tokens.clear() - } - - relFinder.process(doc) - val relMentions = doc.attr[RelationMentionSeq].value - - println("Detected Mentions: ") - doc.coref.mentions.foreach { mention => - println(mention.phrase.string + " with type " + mention.phrase.head.nerTag.baseCategoryValue + " in sentence " + mention.phrase.sentence.string) - } - - println("writing mentions") - relMentions.foreach { rm => - rm.relations.value.foreach { relation => - if(rm.isArg1First) { - println(rm.arg1.string + " " + relation.value + " " + rm.arg2.string + " %.4f ".format(relation.confidence) + relation.provenance) - } else { - println(rm.arg2.string + " " + relation.value + " " + rm.arg1.string + " %.4f ".format(relation.confidence) + relation.provenance) - } - } - } - } -} -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/relation/RelationMention.scala b/src/main/scala/cc/factorie/app/nlp/relation/RelationMention.scala index 9212552..9ac5363 100644 --- a/src/main/scala/cc/factorie/app/nlp/relation/RelationMention.scala +++ b/src/main/scala/cc/factorie/app/nlp/relation/RelationMention.scala @@ -1,55 +1,14 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ package cc.factorie.app.nlp.relation -import cc.factorie.app.nlp.coref._ +import cc.factorie.app.nlp.coref.Mention import cc.factorie.util.Attr -import cc.factorie.variable._ +import cc.factorie.variable.ArrowVariable -import scala.collection.mutable._ - -object RelationArgFeaturesDomain extends CategoricalDomain[String] - -@deprecated("Marked for Possible Deletion", "Before 2014-11-17") -class ArgFeatures(val arg: Mention, val first: Boolean) extends BinaryFeatureVectorVariable[String] { - def domain = RelationArgFeaturesDomain - - def compute() = { - this += "BIAS" - // TODO compute relation features using "first" and "arg" - // TODO convert Lexicons (from refectorie.proj.jntinf) to app.chain.Lexicon - for (tok <- arg.phrase.tokens) { - this += "POS_" + tok.posTag.categoryValue - if (tok.string(0).isLower) - this += "STEM_" + tok.string.replaceAll("\\s+", " ").take(5) - } - - this += "HEAD_POS_" + arg.phrase.headToken.posTag.categoryValue - } -} - -class RelationMentionsSet extends SetVariable[RelationMention] - -class RelationMentionList extends ArrayBuffer[RelationMention]() with Attr - -case class TACRelation(value:String, confidence:Double, provenance:String) - -case class TACRelationList(value:Iterable[TACRelation]) - -class RelationMentionSeq extends SeqVariable[RelationMention] +import scala.collection.mutable.ArrayBuffer class RelationMention(val arg1: Mention, val arg2: Mention, var isArg1First:Boolean=true) extends ArrowVariable(arg1, arg2) with Attr { val _relations = ArrayBuffer[TACRelation]() this.attr += TACRelationList(_relations) def relations = this.attr[TACRelationList] } + diff --git a/src/main/scala/cc/factorie/app/nlp/relation/RelationMentionSeq.scala b/src/main/scala/cc/factorie/app/nlp/relation/RelationMentionSeq.scala new file mode 100644 index 0000000..c6d9b0c --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/relation/RelationMentionSeq.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.relation + +import cc.factorie.variable.SeqVariable + +class RelationMentionSeq extends SeqVariable[RelationMention] diff --git a/src/main/scala/cc/factorie/app/nlp/relation/TACRelation.scala b/src/main/scala/cc/factorie/app/nlp/relation/TACRelation.scala new file mode 100644 index 0000000..4a7f440 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/relation/TACRelation.scala @@ -0,0 +1,3 @@ +package cc.factorie.app.nlp.relation + +case class TACRelation(value:String, confidence:Double, provenance:String) \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/relation/TACRelationList.scala b/src/main/scala/cc/factorie/app/nlp/relation/TACRelationList.scala new file mode 100644 index 0000000..7bba0b5 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/relation/TACRelationList.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.relation + +import scala.collection.mutable.Iterable + +case class TACRelationList(value:Iterable[TACRelation]) diff --git a/src/main/scala/cc/factorie/app/nlp/segment/BigramStatistics.scala b/src/main/scala/cc/factorie/app/nlp/segment/BigramStatistics.scala deleted file mode 100644 index cbf17cf..0000000 --- a/src/main/scala/cc/factorie/app/nlp/segment/BigramStatistics.scala +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. 
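A small hedged sketch of how the slimmed-down RelationMention above carries its TACRelation entries through the attr machinery; the mentions are assumed to come from an already-built coref structure, and the relation name, confidence, and provenance string are invented.

import cc.factorie.app.nlp.coref.Mention
import cc.factorie.app.nlp.relation.{RelationMention, TACRelation}

// m1 and m2 are assumed to be Mentions taken from a document's coref structure.
def recordRelation(m1: Mention, m2: Mention): RelationMention = {
  val rm = new RelationMention(m1, m2, isArg1First = true)
  // _relations is the mutable buffer wrapped by the TACRelationList attr installed in the constructor.
  rm._relations += TACRelation("org:founded_by", 0.87, "pattern: $ARG1 was founded by $ARG2")
  // rm.relations resolves that TACRelationList attr, so its value now contains the entry added above.
  rm.relations.value.foreach(r => println(s"${r.value} ${r.confidence} ${r.provenance}"))
  rm
}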
- This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.segment -import cc.factorie.app.nlp.Document - -import scala.collection.mutable.ArrayBuffer - -/** - * User: apassos - * Date: 8/19/13 - * Time: 2:00 PM - */ -class BigramStatistics { - val wordCounts = new collection.mutable.LinkedHashMap[String, Int]() - val bigramCounts = new collection.mutable.LinkedHashMap[(String,String),Int]() - var totalTokens = 0 - - def process(document: Document): Unit = { - for (token <- document.tokens) { - totalTokens += 1 - wordCounts(token.string) = 1 + wordCounts.getOrElse(token.string, 0) - token.getPrev.foreach(prev => { - bigramCounts((prev.string,token.string)) = 1 + bigramCounts.getOrElse((prev.string,token.string), 0) - }) - } - } - def process(documents: Iterable[Document]): Unit = documents.foreach(process) - - def aggregateCounts(others: Iterable[BigramStatistics]): Unit = { - for (other <- others) { - for ((unigram,value) <- other.wordCounts) { - wordCounts(unigram) = wordCounts.getOrElse(unigram, 0) + value - } - for ((bigram,value) <- other.bigramCounts) { - bigramCounts(bigram) = bigramCounts.getOrElse(bigram, 0) + value - } - totalTokens += other.totalTokens - } - } - - def processParallel(documents: Iterable[Document], nThreads: Int = Runtime.getRuntime.availableProcessors()): Unit = { - val others = new cc.factorie.util.ThreadLocal[BigramStatistics](new BigramStatistics) - cc.factorie.util.Threading.parForeach(documents, nThreads) { doc => - others.get.process(doc) - } - aggregateCounts(others.instances) - } - - def getLikelyPhrases(countThreshold: Int = 5, scoreThreshold: Double = 100.0): Seq[Seq[String]] = { - val bigramPhrases = collection.mutable.LinkedHashSet[Seq[String]]() - val phraseStarts = collection.mutable.HashMap[String,ArrayBuffer[String]]() - bigramCounts.foreach({ case ((prev,token),count) => - val pc = wordCounts(prev) - val pt = wordCounts(token) - if (count > countThreshold && pc > countThreshold && pt > countThreshold) { - // Pointwise mutual information is defined as P(A,B) / P(A) P(B). - // In this case P(A,B) = bigramCounts(A,B)/totalTokens , - // P(A) = wordCounts(A) / totalTokens, P(B) = wordCounts(B) / totalTokens - // Hence we can write PMI = bigramCounts(A,B) * totalTokens / (wordCounts(A) * wordCounts(B)) - val score = totalTokens * count.toDouble / (pc * pt) - if (score > scoreThreshold) { - bigramPhrases += Seq(prev,token) - phraseStarts.getOrElseUpdate(prev, new ArrayBuffer[String]).append(token) - } - } - }) - // now we should have all interesting bigrams. I'll make the assumption that - // if A B and B C are interesting phrases then A B C is interesting without checking. 
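For reference, a tiny worksheet-style sketch (with made-up counts) of the score the removed getLikelyPhrases computed: the pointwise-mutual-information ratio count(A,B) * N / (count(A) * count(B)), compared against scoreThreshold.

// Invented counts: "New York" seen 50 times in a 1,000,000-token corpus,
// "New" 300 times and "York" 60 times overall.
val totalTokens = 1000000
val bigramCount = 50    // count("New", "York")
val countNew    = 300   // count("New")
val countYork   = 60    // count("York")

val score = totalTokens * bigramCount.toDouble / (countNew * countYork)
// score ≈ 2777.8, well above the default scoreThreshold of 100.0,
// so ("New", "York") would have been kept as a likely phrase.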
- val trigramPhrases = collection.mutable.HashSet[Seq[String]]() - bigramPhrases.foreach({ case Seq(prev,token) => - phraseStarts.getOrElse(token, Seq()).foreach(last => trigramPhrases += Seq(prev, token, last)) - }) - bigramPhrases.toSeq ++ trigramPhrases.toSeq - } - - def topMutualInformationBigrams(threshold: Int = 5): Seq[(String,String,Double)] = { - bigramCounts.toSeq.filter(_._2 > threshold).map({ case ((prev,token),count) => - ((prev,token),totalTokens * count.toDouble / (wordCounts(prev) * wordCounts(token))) - }).sortBy(-_._2).take(100).map({case ((prev,token),score) => (prev,token,score)}) - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/segment/BritishToAmerican.scala b/src/main/scala/cc/factorie/app/nlp/segment/BritishToAmerican.scala new file mode 100644 index 0000000..df8c19f --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/segment/BritishToAmerican.scala @@ -0,0 +1,6 @@ +package cc.factorie.app.nlp.segment + +object BritishToAmerican extends scala.collection.mutable.HashMap[String,String] { + this("colour") = "color" + // TODO Add more, e.g. see http://oxforddictionaries.com/us/words/british-and-american-spelling +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/segment/ChainChineseWordSegmenter.scala b/src/main/scala/cc/factorie/app/nlp/segment/ChainChineseWordSegmenter.scala deleted file mode 100644 index 746cc79..0000000 --- a/src/main/scala/cc/factorie/app/nlp/segment/ChainChineseWordSegmenter.scala +++ /dev/null @@ -1,424 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.segment - -import java.io._ - -import cc.factorie._ -import cc.factorie.app.chain.ChainModel -import cc.factorie.app.chineseStrings._ -import cc.factorie.app.nlp._ -import cc.factorie.optimize.OnlineTrainer -import cc.factorie.util.BinarySerializer -import cc.factorie.util.CubbieConversions._ -import cc.factorie.variable._ - -import scala.collection.mutable.ArrayBuffer -import scala.util.Random - -/** A linear-chain CRF model for Chinese word segmentation with four companion - objects, each pre-trained on a different corpus that corresponds to a - different variety of written Mandarin. 
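Stepping back to the new BritishToAmerican map added just above: a minimal sketch of how a caller might consult it, falling back to the original spelling when no mapping exists (the fallback helper is an illustration, not something this patch wires into any annotator).

import cc.factorie.app.nlp.segment.BritishToAmerican

def americanize(word: String): String = BritishToAmerican.getOrElse(word, word)

assert(americanize("colour") == "color")        // mapped entry
assert(americanize("tokenizer") == "tokenizer") // unmapped words pass through unchanged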
- @author Henry Oskar Singer */ - -class ChainChineseWordSegmenter( - labelDomain: SegmentationLabelDomain = BIOSegmentationDomain -) extends DocumentAnnotator { - - val singleCharWordTable = new CategoricalDomain[String] - val bigramTable = new CategoricalDomain[String] - val prefixTable = new CategoricalDomain[String] - val suffixTable = new CategoricalDomain[String] - val rareWordThreshold = 100 - - def this(filePath: String) { - this() - deserialize(filePath) - } - def this(dataStream: InputStream) { - this() - deserialize(dataStream) - } - def this(url: java.net.URL) = this(url.openConnection().getInputStream) - - def process(document: Document): Document = { - - //Since tokens are position-based, and the character tagger removes whitespace, - //its necessary to consider whitespace when creating tokens from tagged characters - val whiteSpaceOffsets = labelDomain.getWhiteSpaceOffsets(document.string) - val segmentedText = segment(document) - - var tokenStart = 0 - - ( 0 to segmentedText.size ).foreach{ i => - - if( i == 0 || isEndOfSentence(segmentedText(i - 1).character.string(0)) ) - new Sentence(document) - - if( i > 0 && (i == segmentedText.size || labelDomain.indicatesSegmentStart(segmentedText(i).categoryValue))){ - new Token(document, whiteSpaceOffsets(tokenStart) + tokenStart, whiteSpaceOffsets(i - 1) + i) - - tokenStart = i - } - } - - document - } - - //Word segmentation is the first thing that happens to Chinese text in the - //pipeline, so there shouldn't be any attrs before word segmentation - def prereqAttrs = Seq() - - def postAttrs = Seq(classOf[Token], classOf[Sentence]) - - def tokenAnnotationString(token: Token): String = { - token.string + "\t" - } - - def serialize(filePath: String): Unit = serialize(new FileOutputStream(new File(filePath))) - def serialize(stream: OutputStream): Unit = { - - val dataStream = new DataOutputStream(new BufferedOutputStream(stream)) - - BinarySerializer.serialize(singleCharWordTable.dimensionDomain, dataStream) - BinarySerializer.serialize(bigramTable.dimensionDomain, dataStream) - BinarySerializer.serialize(prefixTable.dimensionDomain, dataStream) - BinarySerializer.serialize(suffixTable.dimensionDomain, dataStream) - BinarySerializer.serialize(SegmentationFeaturesDomain.dimensionDomain, dataStream) - BinarySerializer.serialize(model, dataStream) - dataStream.close - } - - def deserialize(filePath: String): Unit = deserialize(new FileInputStream(new File(filePath))) - def deserialize(stream: InputStream): Unit = { - - val dataStream = new DataInputStream(new BufferedInputStream(stream)) - - BinarySerializer.deserialize(singleCharWordTable.dimensionDomain, dataStream) - BinarySerializer.deserialize(bigramTable.dimensionDomain, dataStream) - BinarySerializer.deserialize(prefixTable.dimensionDomain, dataStream) - BinarySerializer.deserialize(suffixTable.dimensionDomain, dataStream) - BinarySerializer.deserialize(SegmentationFeaturesDomain.dimensionDomain, dataStream) - BinarySerializer.deserialize(model, dataStream) - dataStream.close - } - - def train(filePaths: List[String]): Unit = { - - println("Training In Progress") - println("\tFeature Extraction In Progress") - - val labeledCorpora = filePaths.map( - filePath => labelDomain.getLabeledCharacters(new File(filePath)) - ).flatten.toIndexedSeq - - populateFeatureTables(labeledCorpora.flatten) - - val trainingSegmentables = getSegmentables(labeledCorpora) - - SegmentationFeaturesDomain.freeze - - println("\tFeature Extraction Completed") - - val examples = - trainingSegmentables.map( 
segmentable => - new model.ChainLikelihoodExample(segmentable.links.map( _.label )) - ).toList - - Random.setSeed(0) - - val shuffledExamples = Random.shuffle(examples) - val trainer = new OnlineTrainer(model.parameters) - - trainer.trainFromExamples(shuffledExamples) - - println("Training Complete\n") - } - - def populateFeatureTables(labeledCorpus: IndexedSeq[(String, String)]): Unit = { - - populateSingleCharWordTable(labeledCorpus) - populateBigramTable(labeledCorpus) - populateAffixTables(labeledCorpus) - } - - def populateAffixTables(labeledCorpus: IndexedSeq[(String, String)]): Unit = { - - val (prefixes, suffixes) = getAffixes(labeledCorpus) - - prefixTable.clear - prefixes.foreach( prefix => prefixTable.index(prefix) ) - prefixTable.freeze - - suffixTable.clear - suffixes.foreach( suffix => suffixTable.index(suffix) ) - suffixTable.freeze - } - - def getAffixes(labeledCorpus: IndexedSeq[(String, String)]): (List[String], List[String]) = { - - val words = getWords(labeledCorpus).filter( word => word.length > 1 ) - val tempDomain = new CategoricalDomain[String] - - tempDomain.gatherCounts = true - words.foreach( word => tempDomain.index(word) ) - tempDomain.trimAboveCount(rareWordThreshold) - - val rareWords = tempDomain.categories.toList - val prefixes = rareWords.map( - word => word.slice(0,1) - ).distinct - val suffixes = rareWords.map( - word => word.slice(word.size-1, word.size) - ).distinct - - (prefixes, suffixes) - } - - def getWords(labeledCorpus: IndexedSeq[(String, String)]): List[String] = { - - val delimiter = '|' - - labeledCorpus.map( - pair => { - if ( labelDomain.indicatesSegmentStart(pair._2) ) - delimiter+pair._1 - else - pair._1 - } - ).mkString.split(delimiter).toList - } - - def populateBigramTable(labeledCorpus: IndexedSeq[(String, String)]): Unit = { - - val bigrams = getBigrams(labeledCorpus) - - bigramTable.clear - bigrams.foreach( bigram => bigramTable.index(bigram) ) - bigramTable.freeze - } - - def getBigrams(labeledCorpus: IndexedSeq[(String, String)]): List[String] = { - - val charsOnly = labeledCorpus.map( pair => pair._1 ) - val bigramZip = ("0" +: charsOnly).zip(charsOnly :+ "0").slice(1, charsOnly.size) - - bigramZip.map( pair => pair._1 + pair._2 ).toList.distinct - } - - def populateSingleCharWordTable(labeledCorpus: IndexedSeq[(String, String)]): Unit = { - - val onlySingleCharWords = getOnlySingleCharWords(labeledCorpus) - - singleCharWordTable.clear - onlySingleCharWords.foreach( char => singleCharWordTable.index(char) ) - singleCharWordTable.freeze - } - - def getOnlySingleCharWords(labeledCorpus: IndexedSeq[(String, String)]): List[String] = { - - val (singleInstances, nonSingleInstances) = labeledCorpus.partition( - pair => labelDomain.isSolitary(pair._2) - ) - val singleChars = singleInstances.map( pair => pair._1 ).toSet - val nonSingleChars = nonSingleInstances.map( pair => pair._1 ).toSet - - (singleChars -- (singleChars & nonSingleChars)).toList - } - - object SegmentationFeaturesDomain extends CategoricalVectorDomain[String] - class SegmentationFeatures(val features: Seq[String]) - extends BinaryFeatureVectorVariable[String] { - - override def skipNonCategories = true - def domain = SegmentationFeaturesDomain - - this ++= features - } - - class Character(character: String, labelString: String, featureSeq: Seq[String]) - extends app.chain.Observation[Character] - with ChainLink[Character, Segmentable] { - - val features = new SegmentationFeatures(featureSeq) - val label = new SegmentationLabel(labelString, this) - - def string = 
character - } - - class SegmentationLabel(labelName: String, val character: Character) - extends LabeledCategoricalVariable(labelName) { - - def domain = labelDomain - } - - class Segmentable extends variable.Chain[Segmentable, Character] - - val model = new ChainModel[SegmentationLabel, SegmentationFeatures, Character]( - labelDomain, - SegmentationFeaturesDomain, - label => label.character.features, - label => label.character, - character => character.label - ) - - def getF1Score(trainPath: String, logPath: String): Double = { - - val labelSeq = segment(trainPath) - val myWords = new ArrayBuffer[ArrayBuffer[SegmentationLabel]] - val numTrueWords: Double = labelSeq.count( - label => labelDomain.indicatesSegmentStart(label.target.categoryValue) - ) - - labelSeq.foreach{ label => - if ( !labelDomain.indicatesSegmentStart(label.categoryValue) && myWords.size > 0 ) - myWords(myWords.size - 1) += label - else - myWords += (new ArrayBuffer[SegmentationLabel] :+ label) - } - - val pad = new ArrayBuffer[SegmentationLabel] - val wordZip = (pad +: myWords).zip(myWords :+ pad).slice(1, myWords.size) - val numCorrect: Double = wordZip.count{ - wordPair => wordPair._1.forall( label => label.valueIsTarget ) && - labelDomain.indicatesSegmentStart(wordPair._2(0).target.categoryValue) - } - val printer = new PrintWriter(new BufferedWriter(new FileWriter(logPath))) - val printString: String = myWords.filter( word => - word.exists( label => !label.valueIsTarget ) - ).map( - word => word.map( label => label.character.string + - "/" + label.categoryValue + - "/" + label.target.categoryValue + - "\t" - ).reduceLeft(_+_) - ).reduceLeft( - (x,y) => x + "\n" + y - ) - - printer.write(printString) - printer.close - - val precision = numCorrect / myWords.size - val recall = numCorrect / numTrueWords - - println("Precision: " + precision + "\tRecall: " + recall) - 2 * (precision * recall)/(precision + recall) - } - - def segment(filePath: String): IndexedSeq[SegmentationLabel] = segment(getSegmentables(new File(filePath))) - def segment(document: Document): IndexedSeq[SegmentationLabel] = segment(getSegmentables(document)) - def segment(segmentables: IndexedSeq[Segmentable]): IndexedSeq[SegmentationLabel] = { - - val labelSeqs = segmentables.map( _.links.map( _.label ) ) - - labelSeqs.foreach( labelSeq => model.maximize(labelSeq)(null) ) - - labelSeqs.flatten - } - - def getSegmentables(corpus: File): IndexedSeq[Segmentable] = { - - val labeledExamples: IndexedSeq[IndexedSeq[(String, String)]] = labelDomain.getLabeledCharacters(corpus) - - getSegmentables(labeledExamples) - } - def getSegmentables(document: Document): IndexedSeq[Segmentable] = { - - val labeledExamples: IndexedSeq[IndexedSeq[(String, String)]] = labelDomain.getLabeledCharacters(document) - - getSegmentables(labeledExamples) - } - def getSegmentables(labeledExamples: IndexedSeq[IndexedSeq[(String, String)]]): IndexedSeq[Segmentable] = { - - val segmentables = labeledExamples.map( - example => new Segmentable ++= (0 until example.size).map( - i => new Character(example(i)._1, example(i)._2, characterToFeatures(i, example)) - ) - ) - - println("Segmentables Retrieved") - - segmentables - } - - //Returns the list of features for a character in an unsegmented data set - //Labeling scheme: PP (prev prev) P (prev) N (next) NN (next next) *L (* label) - def characterToFeatures(i: Int, labeledCharacters: IndexedSeq[(String, String)]): Seq[String] = { - - val defaultFeature = "INVALID" - val numChars = labeledCharacters.size - val cneg2label = "C-2" - val 
cneg1label = "C-1" - val c0label = "C0" - val cpos1label = "C+1" - val cpos2label = "C+2" - - val cneg2 = - if( i - 2 >= 0 ) labeledCharacters(i-2)._1 - else defaultFeature - val cneg1 = - if( i - 1 >= 0 ) labeledCharacters(i-1)._1 - else defaultFeature - val c0 = labeledCharacters(i)._1 - val cpos1 = - if( i + 1 < numChars ) labeledCharacters(i+1)._1 - else defaultFeature - val cpos2 = - if( i + 2 < numChars ) labeledCharacters(i+2)._1 - else defaultFeature - val features = new ArrayBuffer[String] - - //Add unigram character identity features - features ++= Seq( - cneg2 + cneg2label, - cneg1 + cneg1label, - c0 + c0label, - cpos1 + cpos1label - ) - - //Add bigram character identity features - features ++= Seq( - cneg2 + cneg1 + cneg2label + cneg1label, - cneg1 + c0 + cneg1label + c0label, - cneg1 + cpos1 + cneg1label + cpos1label, - c0 + cpos1 + c0label + cpos1label, - c0 + cpos2 + c0label + cpos2label - ) - - //Add feature functions including reduplication, known bigram, - //solitary character, prefix and affix - features ++= List( - (cneg1 equals c0, "RE" + cneg1label + c0label), - (cneg1 equals cpos1, "RE" + cneg1label + cpos1label), - (tableContains(bigramTable, cneg1+c0), "BI" + cneg1 + c0), - (tableContains(singleCharWordTable, cneg1), "UN" + cneg1), - (tableContains(singleCharWordTable, c0), "UN" + c0), - (tableContains(singleCharWordTable, cpos1), "UN" + cpos1), - (tableContains(prefixTable, cneg1), "PR" + cneg1), - (tableContains(suffixTable, c0), "SU" + c0) - ).filter( pair => pair._1 ).map( pair => pair._2 ).toList - - features.toList.filter( feature => !feature.contains(defaultFeature) ).toSeq - } - - def tableContains(domain: CategoricalDomain[String], element: String): Boolean = { - - try { - domain.getIndex(element) - return true - } catch { - case _ : Throwable => return false - } - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/segment/ChineseSegLabelDomains.scala b/src/main/scala/cc/factorie/app/nlp/segment/ChineseSegLabelDomains.scala deleted file mode 100644 index 51b7c8f..0000000 --- a/src/main/scala/cc/factorie/app/nlp/segment/ChineseSegLabelDomains.scala +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.segment - -import java.io.File - -import cc.factorie.app.chineseStrings._ -import cc.factorie.app.nlp._ -import cc.factorie.variable._ - -import scala.collection.mutable.ArrayBuffer - -abstract class SegmentationLabelDomain - extends CategoricalDomain[String] - with SegmentedCorpusLabeling - -object BIOSegmentationDomain extends SegmentationLabelDomain { - - this ++= Vector( - "RR", - "LR", - "LL", - "MM" - ) - - freeze - - def indicatesSegmentStart(label: String): Boolean = { - val segmentStarts = List( "LL", "LR" ) - - segmentStarts.exists( segStart => segStart equals label ) - } - - def isSolitary(label: String): Boolean = label equals "LR" - - def getLabeledCharacter(i: Int, line: String): (String, String) = { - - val label = - if(isFirst(i, line) && isLast(i, line)) "LR" - else if(isFirst(i, line)) "LL" - else if(isLast(i, line)) "RR" - else "MM" - - (line.slice(i, i+1), label) - } -} - -trait SegmentedCorpusLabeling { - - def indicatesSegmentStart(label: String): Boolean - - def isSolitary(label: String): Boolean - - def getLabeledCharacters(corpus: File): IndexedSeq[IndexedSeq[(String, String)]] = { - - val fileLines = scala.io.Source.fromFile(corpus, "utf-8").getLines.toList - val labeledCorpus = - fileLines.map( - line => (0 until line.size).filter( - i => !isWhiteSpace(line(i)) - ).map( - i => getLabeledCharacter(i, line) - ).toIndexedSeq - ).toIndexedSeq - - labeledCorpus - } - - def getLabeledCharacters(document: Document): IndexedSeq[IndexedSeq[(String, String)]] = { - - val docString = document.string - val labeledCorpus = (0 until docString.size).filter( - i => !isWhiteSpace(docString(i)) - ).map( - i => getLabeledCharacter(i, docString) - ).toIndexedSeq - - IndexedSeq(labeledCorpus) - } - - def getLabeledCharacter(i: Int, line: String): (String, String) - - def getWhiteSpaceOffsets(content: String): IndexedSeq[Int] = { - - val offsets = new ArrayBuffer[Int] - - offsets += 0 - - var count = 0 - - ( 0 until content.size ).foreach{ i => - if(isWhiteSpace(content(i))) count += 1 - else offsets += count - } - - offsets - } - - //Checks if a character in a training set is first in a word - def isFirst(i: Int, line: String): Boolean = - (i == 0 || isWhiteSpace(line(i-1)) && !isWhiteSpace(line(i))) - - //Checks if a character in a training set is last in a word - def isLast(i: Int, line: String): Boolean = - (i == (line.size - 1) || isWhiteSpace(line(i+1)) && !isWhiteSpace(line(i))) - -} diff --git a/src/main/scala/cc/factorie/app/nlp/segment/DehyphenatingTokenizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/DehyphenatingTokenizer.scala deleted file mode 100644 index 6c342ab..0000000 --- a/src/main/scala/cc/factorie/app/nlp/segment/DehyphenatingTokenizer.scala +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
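To make the removed labeling scheme easier to review, here is a plain-Scala mirror (assuming whitespace delimits words, as in the training data above) of how BIOSegmentationDomain tagged each non-whitespace character: LL for a word start, RR for a word end, LR for a solitary single-character word, MM otherwise.

def labelCharacters(line: String): IndexedSeq[(Char, String)] = {
  def isFirst(i: Int) = !line(i).isWhitespace && (i == 0 || line(i - 1).isWhitespace)
  def isLast(i: Int)  = !line(i).isWhitespace && (i == line.length - 1 || line(i + 1).isWhitespace)
  (0 until line.length).filterNot(i => line(i).isWhitespace).map { i =>
    val label =
      if (isFirst(i) && isLast(i)) "LR" // solitary single-character word
      else if (isFirst(i)) "LL"         // word start
      else if (isLast(i)) "RR"          // word end
      else "MM"                         // word middle
    line(i) -> label
  }
}

// labelCharacters("中国 人") == Vector(('中',"LL"), ('国',"RR"), ('人',"LR"))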
*/ -package cc.factorie.app.nlp.segment - -import cc.factorie.app.nlp._ - - -/** - * concatenates words split by hyphens in the original text based on user-provided dictionary - * or other words in the same document. It works on the output of the tokenizer. - * Caution: It modifies the output of the tokenizer by removing some tokens so run this before any other downstream tasks. - * @param tokenizer tokenizer to use to tokenize the doc. Default is DeterministicTokenizer - * @param dictionary dictionary to lookup to check for merge eligibility - * @param useTokens if true, other tokens in document are used to check for merge eligibility - * @author harshal - */ -class DehyphenatingTokenizer[T <: DocumentAnnotator](tokenizer: T = DeterministicNormalizingTokenizer, dictionary: Set[String] = Set.empty[String], useTokens: Boolean) extends DocumentAnnotator { - - def tokenize(document: Document) = tokenizer.process(document) - - def process(document: Document) = { - val tokenizedDoc = tokenize(document) - - lazy val dictionaryFromDocWords = buildDictionaryFromDocWords(tokenizedDoc.tokens) - - def eligibleForMerge(first: String, last:String) = dictionary((first+last).toLowerCase) || (useTokens && dictionaryFromDocWords((first+last).toLowerCase)) - - var _skipCounter = 0 - - for(section <- tokenizedDoc.sections){ - if(section.tokens.size>2){ //if the section has less than 3 tokens, nothing to do - var lastWindow: IndexedSeq[Token] = null - for(tokens <- section.tokens.sliding(3).toList){ - lastWindow = tokens - if(_skipCounter==0 && tokens(1).string=="-" && tokens(2).hasFollowingWhitespace - && eligibleForMerge(tokens(0).string, tokens(2).string)) { - val first = tokens.head - val last = tokens.last - //create a new token and set it's string offset to the first to the last token - val t = new Token(first.stringStart, last.stringEnd) - //add a TokenString attr to output the concatenated string - t.attr += new TokenString(first, first.string+last.string) - section.remove(first.positionInSection) - section.insert(first.positionInSection, t) - //next two windows must be skipped - _skipCounter = 2 - } - else{ - //removes the next two tokens after a merge - if(_skipCounter != 0) { - section.remove(tokens(0).positionInSection) - _skipCounter-=1 - } - } - } - //if the last window was merged then - if(_skipCounter>0){ - section.remove(lastWindow(1).positionInSection) - section.remove(lastWindow(2).positionInSection) - } - } - } - tokenizedDoc - } - - def buildDictionaryFromDocWords(tokens: Iterable[Token]) = tokens.filterNot(_.isPunctuation).map(_.string).toSet - - def prereqAttrs: Iterable[Class[_]] = Nil - def postAttrs: Iterable[Class[_]] = List(classOf[Token]) - - /** How the annotation of this DocumentAnnotator should be printed in one-word-per-line (OWPL) format. - If there is no per-token annotation, return null. Used in Document.owplString. */ - def tokenAnnotationString(token: Token) = token.stringStart.toString+'\t'+token.stringEnd.toString -} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicLexerTokenizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicLexerTokenizer.scala index b993eb5..de67395 100644 --- a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicLexerTokenizer.scala +++ b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicLexerTokenizer.scala @@ -1,53 +1,42 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. 
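A hedged usage sketch (against the pre-patch tree, since this annotator is deleted here) of how the DehyphenatingTokenizer above was meant to be driven; the sample sentence and dictionary entry are invented.

import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.segment.{DehyphenatingTokenizer, DeterministicNormalizingTokenizer}

// Merge "co - exist" style splits back into a single token when the joined form
// is in the dictionary (or, with useTokens = true, occurs elsewhere in the document).
val dehyphenator = new DehyphenatingTokenizer(
  tokenizer  = DeterministicNormalizingTokenizer,
  dictionary = Set("coexist"),
  useTokens  = true)

val doc = dehyphenator.process(new Document("The two species co - exist peacefully."))
println(doc.tokens.map(_.string).mkString(" | "))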
- This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp.segment import java.io.StringReader import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} -/** Split a String into a sequence of Tokens. Aims to adhere to tokenization rules used in Ontonotes and Penn Treebank. - Note that CoNLL tokenization would use tokenizeAllDashedWords=true. - Punctuation that ends a sentence should be placed alone in its own Token, hence this segmentation implicitly - defines sentence segmentation also. (Although our the DeterministicSentenceSegmenter does make a few adjustments - beyond this tokenizer.) This tokenizer can also normalize. - This tokenizer is significantly faster than the DeterministicRegexTokenizer, using the EnglishLexer DFA - automatically generated by JFlex using the definition in EnglishLexer.flex. + +/** Split a String into a sequence of Tokens. Aims to adhere to tokenization rules used in Ontonotes and Penn Treebank. + * Note that CoNLL tokenization would use tokenizeAllDashedWords=true. + * Punctuation that ends a sentence should be placed alone in its own Token, hence this segmentation implicitly + * defines sentence segmentation also. (Although the DeterministicSentenceSegmenter does make a few adjustments + * beyond this tokenizer.) This tokenizer can also normalize. + ** + *This tokenizer is significantly faster than the DeterministicRegexTokenizer, using the EnglishLexer DFA + *automatically generated by JFlex using the definition in EnglishLexer.flex. */ class DeterministicLexerTokenizer( - val tokenizeSgml:Boolean = false, // Keep sgml/html tags as tokens - val tokenizeNewline:Boolean = false, // Keep newlines as tokens - val tokenizeWhitespace:Boolean = false, // Keep all whitespace, including newlines, as tokens - val tokenizeAllDashedWords:Boolean = false, // Separate dashed words into separate tokens, such as in CoNLL - val abbrevPrecedesLowercase:Boolean = false, // Assume a period followed by a lower case word is an abbrev and not end of sentence (see below) - val normalize: Boolean = true, // Whether to normalize token strings - val normalizeQuote:Boolean = true, // Convert all double quotes to " - val normalizeApostrophe:Boolean = true, // Convert all apostrophes to ', even within token strings - val normalizeCurrency:Boolean = true, // Convert all currency symbols to "$", except cents symbol to "cents" - val normalizeAmpersand:Boolean = true, // Convert all ampersand symbols (including "&amp;") to "&" - val normalizeFractions:Boolean = true, // Convert unicode fraction characters to their spelled out analogues, like "3/4" - val normalizeEllipsis:Boolean = true, // Convert unicode ellipsis character to spelled out analogue, "..." - val undoPennParens:Boolean = true, // Change -LRB- etc to "(" etc.
- val unescapeSlash:Boolean = true, // Change \/ to / - val unescapeAsterisk:Boolean = true, // Change \* to * - val normalizeMDash:Boolean = true, // Convert all em-dashes to double dash -- - val normalizeDash:Boolean = true, // Convert all other dashes to single dash - - val normalizeHtmlSymbol:Boolean = true, // Convert &lt; to <, etc - val normalizeHtmlAccent:Boolean = true // Convert Beyoncé to Beyonce - ) extends DocumentAnnotator { + val tokenizeSgml:Boolean = false, // Keep sgml/html tags as tokens + val tokenizeNewline:Boolean = false, // Keep newlines as tokens + val tokenizeWhitespace:Boolean = false, // Keep all whitespace, including newlines, as tokens + val tokenizeAllDashedWords:Boolean = false, // Separate dashed words into separate tokens, such as in CoNLL + val abbrevPrecedesLowercase:Boolean = false, // Assume a period followed by a lower case word is an abbrev and not end of sentence (see below) + val normalize: Boolean = true, // Whether to normalize token strings + val normalizeQuote:Boolean = true, // Convert all double quotes to " + val normalizeApostrophe:Boolean = true, // Convert all apostrophes to ', even within token strings + val normalizeCurrency:Boolean = true, // Convert all currency symbols to "$", except cents symbol to "cents" + val normalizeAmpersand:Boolean = true, // Convert all ampersand symbols (including "&amp;") to "&" + val normalizeFractions:Boolean = true, // Convert unicode fraction characters to their spelled out analogues, like "3/4" + val normalizeEllipsis:Boolean = true, // Convert unicode ellipsis character to spelled out analogue, "..." + val undoPennParens:Boolean = true, // Change -LRB- etc to "(" etc. + val unescapeSlash:Boolean = true, // Change \/ to / + val unescapeAsterisk:Boolean = true, // Change \* to * + val normalizeMDash:Boolean = true, // Convert all em-dashes to double dash -- + val normalizeDash:Boolean = true, // Convert all other dashes to single dash - + val normalizeHtmlSymbol:Boolean = true, // Convert &lt; to <, etc + val normalizeHtmlAccent:Boolean = true // Convert Beyoncé to Beyonce + ) extends DocumentAnnotator { /** How the annotation of this DocumentAnnotator should be printed in one-word-per-line (OWPL) format. If there is no per-token annotation, return null. Used in Document.owplString.
*/ @@ -55,20 +44,20 @@ class DeterministicLexerTokenizer( val lexer = // here we make sure that if normalize = false, we really don't normalize anything - if(normalize) - new EnglishLexer(null, tokenizeSgml, tokenizeNewline, tokenizeWhitespace, tokenizeAllDashedWords, abbrevPrecedesLowercase, - normalizeQuote, normalizeApostrophe, normalizeCurrency, normalizeAmpersand, normalizeFractions, normalizeEllipsis, - undoPennParens, unescapeSlash, unescapeAsterisk, normalizeMDash, normalizeDash, normalizeHtmlSymbol, normalizeHtmlAccent) - else - new EnglishLexer(null, tokenizeSgml, tokenizeNewline, tokenizeWhitespace, tokenizeAllDashedWords, abbrevPrecedesLowercase, - false, false, false, false, false, false, false, false, false, false, false, false, false) + if(normalize) + new EnglishLexer(null, tokenizeSgml, tokenizeNewline, tokenizeWhitespace, tokenizeAllDashedWords, abbrevPrecedesLowercase, + normalizeQuote, normalizeApostrophe, normalizeCurrency, normalizeAmpersand, normalizeFractions, normalizeEllipsis, + undoPennParens, unescapeSlash, unescapeAsterisk, normalizeMDash, normalizeDash, normalizeHtmlSymbol, normalizeHtmlAccent) + else + new EnglishLexer(null, tokenizeSgml, tokenizeNewline, tokenizeWhitespace, tokenizeAllDashedWords, abbrevPrecedesLowercase, + false, false, false, false, false, false, false, false, false, false, false, false, false) def process(document: Document): Document = { for (section <- document.sections) { /* Add this newline to avoid JFlex issue where we can't match EOF with lookahead */ val reader = new StringReader(section.string + "\n") lexer.yyreset(reader) - + var currentToken = lexer.yylex().asInstanceOf[(String, Int, Int)] while (currentToken != null){ if (abbrevPrecedesLowercase && section.length > 1 && section.tokens.last.string == "." && java.lang.Character.isLowerCase(currentToken._1(0)) && section.tokens(section.length-2).stringEnd == section.tokens(section.length-1).stringStart) { @@ -95,91 +84,4 @@ class DeterministicLexerTokenizer( /** Convenience function to run the tokenizer on an arbitrary String. The implementation builds a Document internally, then maps to token strings. 
*/ def apply(s:String): Seq[String] = process(new Document(s)).tokens.toSeq.map(_.string) -} - - -/* This version does not perform normalization, only tokenization */ - -object DeterministicTokenizer extends DeterministicLexerTokenizer( - tokenizeSgml = false, - tokenizeNewline = false, - tokenizeWhitespace = false, - tokenizeAllDashedWords = false, - abbrevPrecedesLowercase = false, - normalize = false, - normalizeQuote = false, - normalizeApostrophe = false, - normalizeCurrency = false, - normalizeAmpersand = false, - normalizeFractions = false, - normalizeEllipsis = false, - undoPennParens = false, - unescapeSlash = false, - unescapeAsterisk = false, - normalizeMDash = false, - normalizeDash = false, - normalizeHtmlSymbol = false, - normalizeHtmlAccent = false -) - -/* This version performs normalization while it tokenizes, and also includes html tags as tokens */ -object DeterministicNormalizingHtmlTokenizer extends DeterministicLexerTokenizer( - tokenizeSgml = true, - tokenizeNewline = false, - tokenizeWhitespace = false, - tokenizeAllDashedWords = false, - abbrevPrecedesLowercase = false, - normalize = true, - normalizeQuote = true, - normalizeApostrophe = true, - normalizeCurrency = true, - normalizeAmpersand = true, - normalizeFractions = true, - normalizeEllipsis = true, - undoPennParens = true, - unescapeSlash = true, - unescapeAsterisk = true, - normalizeMDash = true, - normalizeDash = true, - normalizeHtmlSymbol = true, - normalizeHtmlAccent = true -) - -/* This token performs normalization while it tokenizes, removing html tags; You probably want to use this one */ - -object DeterministicNormalizingTokenizer extends DeterministicLexerTokenizer( - tokenizeSgml = false, - tokenizeNewline = false, - tokenizeWhitespace = false, - tokenizeAllDashedWords = false, - abbrevPrecedesLowercase = false, - normalize = true, - normalizeQuote = true, - normalizeApostrophe = true, - normalizeCurrency = true, - normalizeAmpersand = true, - normalizeFractions = true, - normalizeEllipsis = true, - undoPennParens = true, - unescapeSlash = true, - unescapeAsterisk = true, - normalizeMDash = true, - normalizeDash = true, - normalizeHtmlSymbol = true, - normalizeHtmlAccent = true -){ - /* For testing purposes: Tokenizes and normalizes input from stdin using DeterministicNormalizingTokenizer */ - /* - def main(args: Array[String]): Unit = { - val string = io.Source.fromInputStream(System.in).mkString -// println("Tokenizing...") - val doc = new Document(string) - val t0 = System.currentTimeMillis() - DeterministicNormalizingTokenizer.process(doc) - val time = System.currentTimeMillis()-t0 - println(s"Processed ${doc.tokenCount} tokens in ${time}ms (${doc.tokenCount.toDouble/time*1000} tokens/second)") - println(doc.tokens.map(_.string).mkString("\n")) - } - */ -} - +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicNormalizingHtmlTokenizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicNormalizingHtmlTokenizer.scala new file mode 100644 index 0000000..67efc8e --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicNormalizingHtmlTokenizer.scala @@ -0,0 +1,24 @@ +package cc.factorie.app.nlp.segment + +/* This version performs normalization while it tokenizes, and also includes html tags as tokens */ +object DeterministicNormalizingHtmlTokenizer extends DeterministicLexerTokenizer( + tokenizeSgml = true, + tokenizeNewline = false, + tokenizeWhitespace = false, + tokenizeAllDashedWords = false, + 
abbrevPrecedesLowercase = false, + normalize = true, + normalizeQuote = true, + normalizeApostrophe = true, + normalizeCurrency = true, + normalizeAmpersand = true, + normalizeFractions = true, + normalizeEllipsis = true, + undoPennParens = true, + unescapeSlash = true, + unescapeAsterisk = true, + normalizeMDash = true, + normalizeDash = true, + normalizeHtmlSymbol = true, + normalizeHtmlAccent = true +) diff --git a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicNormalizingTokenizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicNormalizingTokenizer.scala new file mode 100644 index 0000000..cd40aaf --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicNormalizingTokenizer.scala @@ -0,0 +1,25 @@ +package cc.factorie.app.nlp.segment + +/* This tokenizer performs normalization while it tokenizes, removing HTML tags; you probably want to use this one */ + +object DeterministicNormalizingTokenizer extends DeterministicLexerTokenizer( + tokenizeSgml = false, + tokenizeNewline = false, + tokenizeWhitespace = false, + tokenizeAllDashedWords = false, + abbrevPrecedesLowercase = false, + normalize = true, + normalizeQuote = true, + normalizeApostrophe = true, + normalizeCurrency = true, + normalizeAmpersand = true, + normalizeFractions = true, + normalizeEllipsis = true, + undoPennParens = true, + unescapeSlash = true, + unescapeAsterisk = true, + normalizeMDash = true, + normalizeDash = true, + normalizeHtmlSymbol = true, + normalizeHtmlAccent = true +) diff --git a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicRegexTokenizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicRegexTokenizer.scala index d6ca209..96b5499 100644 --- a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicRegexTokenizer.scala +++ b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicRegexTokenizer.scala @@ -16,10 +16,11 @@ package cc.factorie.app.nlp.segment import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token} /** Split a String into a sequence of Tokens. Aims to adhere to tokenization rules used in Ontonotes and Penn Treebank. - Note that CoNLL tokenization would use tokenizeAllDashedWords=true. - Punctuation that ends a sentence should be placed alone in its own Token, hence this segmentation implicitly defines sentence segmentation also. - (Although our the DeterministicSentenceSegmenter does make a few adjustments beyond this tokenizer.) - @author Andrew McCallum + *Note that CoNLL tokenization would use tokenizeAllDashedWords=true. + *Punctuation that ends a sentence should be placed alone in its own Token, hence this segmentation implicitly defines sentence segmentation also. + *(Although the DeterministicSentenceSegmenter does make a few adjustments beyond this tokenizer.) + * + *@author Andrew McCallum */ class DeterministicRegexTokenizer(caseSensitive:Boolean = false, tokenizeSgml:Boolean = false, tokenizeNewline:Boolean = false, tokenizeAllDashedWords:Boolean = false, abbrevPrecedesLowercase:Boolean = false) extends DocumentAnnotator { diff --git a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicSentenceSegmenter.scala b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicSentenceSegmenter.scala index ce760b8..574f4ea 100644 --- a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicSentenceSegmenter.scala +++ b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicSentenceSegmenter.scala @@ -12,10 +12,13 @@ limitations under the License.
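An end-to-end sketch of the retained segmentation pipeline, mirroring the example main that used to live in DeterministicSentenceSegmenter; the sample text is invented.

import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.segment.{DeterministicNormalizingTokenizer, DeterministicSentenceSegmenter}

val doc = new Document("Dr. Smith arrived. He spoke briefly.")
DeterministicNormalizingTokenizer.process(doc)   // adds Token annotations
DeterministicSentenceSegmenter.process(doc)      // adds Sentence annotations on top of the tokens
doc.sentences.foreach(s => println(s.tokens.map(_.string).mkString(" ")))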
*/ package cc.factorie.app.nlp.segment -import cc.factorie.app.nlp._ +import cc.factorie.app.nlp.lexicon.StopWords +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Sentence, Token} + /** Segments a sequence of tokens into sentences. - @author Andrew McCallum */ + * + * @author Andrew McCallum */ class DeterministicSentenceSegmenter extends DocumentAnnotator { /** How the annotation of this DocumentAnnotator should be printed in one-word-per-line (OWPL) format. @@ -48,7 +51,7 @@ class DeterministicSentenceSegmenter extends DocumentAnnotator { val charOffsetBoundary = 10 /** Returns true for strings that probably start a sentence after a word that ends with a period. */ - def possibleSentenceStart(s:String): Boolean = java.lang.Character.isUpperCase(s(0)) && (cc.factorie.app.nlp.lexicon.StopWords.containsWord(s) || s == "Mr." || s == "Mrs." || s == "Ms." || s == "\"" || s == "''") // Consider adding more honorifics and others here. -akm + def possibleSentenceStart(s:String): Boolean = java.lang.Character.isUpperCase(s(0)) && (StopWords.containsWord(s) || s == "Mr." || s == "Mrs." || s == "Ms." || s == "\"" || s == "''") // Consider adding more honorifics and others here. -akm def process(document: Document): Document = { @@ -121,17 +124,4 @@ class DeterministicSentenceSegmenter extends DocumentAnnotator { def postAttrs: Iterable[Class[_]] = List(classOf[Sentence]) } -object DeterministicSentenceSegmenter extends DeterministicSentenceSegmenter { - /* - def main(args: Array[String]): Unit = { - for (filename <- args) yield { - val doc = new Document(io.Source.fromFile(filename).mkString).setName(filename) - DeterministicNormalizingTokenizer.process(doc) - DeterministicSentenceSegmenter.this.process(doc) - println(filename) - for (sentence <- doc.sentences) - print("\n\n" + sentence.tokens.map(_.string).mkString(" | ")) - print("\n\n\n") - } - } */ -} +object DeterministicSentenceSegmenter extends DeterministicSentenceSegmenter diff --git a/src/main/scala/cc/factorie/app/nlp/segment/DeterministicTokenizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicTokenizer.scala new file mode 100644 index 0000000..469b35d --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/segment/DeterministicTokenizer.scala @@ -0,0 +1,25 @@ +package cc.factorie.app.nlp.segment + +/* This version does not perform normalization, only tokenization */ + +object DeterministicTokenizer extends DeterministicLexerTokenizer( + tokenizeSgml = false, + tokenizeNewline = false, + tokenizeWhitespace = false, + tokenizeAllDashedWords = false, + abbrevPrecedesLowercase = false, + normalize = false, + normalizeQuote = false, + normalizeApostrophe = false, + normalizeCurrency = false, + normalizeAmpersand = false, + normalizeFractions = false, + normalizeEllipsis = false, + undoPennParens = false, + unescapeSlash = false, + unescapeAsterisk = false, + normalizeMDash = false, + normalizeDash = false, + normalizeHtmlSymbol = false, + normalizeHtmlAccent = false +) diff --git a/src/main/scala/cc/factorie/app/nlp/segment/OntonotesNormalizedTokenString.scala b/src/main/scala/cc/factorie/app/nlp/segment/OntonotesNormalizedTokenString.scala new file mode 100644 index 0000000..e36d49b --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/segment/OntonotesNormalizedTokenString.scala @@ -0,0 +1,9 @@ +package cc.factorie.app.nlp.segment + +import cc.factorie.app.nlp.Token + +/** + * Created by andrew@andrewresearch.net on 27/10/17. 
+ */ + +class OntonotesNormalizedTokenString(token:Token, str:String) extends PlainNormalizedTokenString(token, str) \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/segment/OntonotesTokenNormalizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/OntonotesTokenNormalizer.scala new file mode 100644 index 0000000..3a3bf02 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/segment/OntonotesTokenNormalizer.scala @@ -0,0 +1,11 @@ +package cc.factorie.app.nlp.segment + +import cc.factorie.app.nlp.Token + +object OntonotesTokenNormalizer extends TokenNormalizer1((t:Token, s:String) => new OntonotesNormalizedTokenString(t,s)) { + override def processToken(token:Token): Unit = { + super.processToken(token) + // TODO Add more normalization here (not yet sure what needed), but keep Lemma issues separate! + // coexist -> co-exist + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/segment/PhraseTokenizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/PhraseTokenizer.scala deleted file mode 100644 index 41895ce..0000000 --- a/src/main/scala/cc/factorie/app/nlp/segment/PhraseTokenizer.scala +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.segment - -import cc.factorie.app.nlp._ - -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer - -/** - * User: apassos - * Date: 8/19/13 - * Time: 12:58 PM - */ - -/** - * A sequence of sections which are tokenized as phrases. - */ -class PhraseSectionList extends ArrayBuffer[Section] - -class PhraseTrie { - class TrieKey - case class StringKey(s: String) extends TrieKey - object EndKey extends TrieKey - val map = new mutable.HashMap[TrieKey, PhraseTrie] - def add(phrase: Seq[String]) { - if (phrase.length > 0) { - val child = map.getOrElseUpdate(StringKey(phrase.head), new PhraseTrie) - child.add(phrase.tail) - } else { - map(EndKey) = new PhraseTrie - } - } - - def canEnd = if (map.contains(EndKey)) 0 else -1 - - def findLongestPhraseLength(tokens: Seq[Token], position: Int): Int = { - math.max(canEnd, map.get(StringKey(tokens(position).string)) match { - case None => -1 - case Some(trie) => - if (position + 1 < tokens.length) { - val len = trie.findLongestPhraseLength(tokens, position+1) - if (len >= 0) len + 1 - else -1 - } else -1 - }) - } -} - -object PhraseTokenizerModes extends scala.Enumeration { - type PhraseTokenizerMode = Value - val REPLACE_SECTIONS, ADD_TO_SECTIONS, ADD_SEPARATELY = Value -} - -/** - * A tokenizer which will merge existing tokens if they are from one of the phrases given. - * - * Efficiently uses a trie-like data structure to simulate the finite automaton for - * tokenization. The behavior is that if there is a long and a short phrase with the same prefix - * the longer one will be picked greedily. 
- * - * This version gets all attributes from the last token in the phrase. - * - * @param phrases The set of phrases to be picked. - * @param mode The mode. If ADD_SEPARATELY the new sections are only added to the attribute. - * If ADD_TO_SECTIONS the new sections are added to the document. - * IF REPLACE_SECTIONS the existing sections in the document are replaced. - */ -class PhraseTokenizer(phrases: Iterable[Seq[String]], val mode: PhraseTokenizerModes.PhraseTokenizerMode = PhraseTokenizerModes.ADD_SEPARATELY) extends DocumentAnnotator { - val trie = new PhraseTrie - phrases.foreach(trie.add) - def prereqAttrs = Seq(classOf[Token]) - def postAttrs = Seq(classOf[PhraseSectionList]) - def tokenAnnotationString(token: Token) = null - - def process(document: Document): Document = { - val newSections = new PhraseSectionList - document.attr += newSections - for (section <- document.sections) { - val newSection = new BasicSection(section.document, section.stringStart, section.stringEnd) - newSections += newSection - val tokens = section.tokens - var i = 0 - while (i < tokens.length) { - trie.findLongestPhraseLength(tokens, i) match { - case -1 => val t = new Token(newSection, tokens(i).stringStart, tokens(i).stringEnd) - tokens(i).attr.values.foreach(t.attr.+=) - i += 1 - case 0 => throw new Error(s"Found a single-token phrase in the dictionary, should not happen. Offending phrase: ${tokens(i).string}") - case n => - val t = new Token(newSection, tokens(i).stringStart, tokens(i+n-1).stringEnd) - tokens(i+n-1).attr.values.foreach(t.attr.+=) - i += n - } - } - } - mode match { - case PhraseTokenizerModes.ADD_TO_SECTIONS => newSections.foreach(document.+=) - case PhraseTokenizerModes.REPLACE_SECTIONS => - document.clearSections() - newSections.foreach(document.+=) - case PhraseTokenizerModes.ADD_SEPARATELY => - } - document - } -} diff --git a/src/main/scala/cc/factorie/app/nlp/segment/PlainNormalizedTokenString.scala b/src/main/scala/cc/factorie/app/nlp/segment/PlainNormalizedTokenString.scala new file mode 100644 index 0000000..08ef6da --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/segment/PlainNormalizedTokenString.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.segment + +import cc.factorie.app.nlp.{Token, TokenString} + +class PlainNormalizedTokenString(token:Token, str:String) extends TokenString(token, str) diff --git a/src/main/scala/cc/factorie/app/nlp/segment/PlainTokenNormalizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/PlainTokenNormalizer.scala new file mode 100644 index 0000000..bc37ea2 --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/segment/PlainTokenNormalizer.scala @@ -0,0 +1,5 @@ +package cc.factorie.app.nlp.segment + +import cc.factorie.app.nlp.Token + +object PlainTokenNormalizer extends TokenNormalizer1((t:Token, s:String) => new PlainNormalizedTokenString(t,s)) \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/segment/PunktSentenceSegmenter.scala b/src/main/scala/cc/factorie/app/nlp/segment/PunktSentenceSegmenter.scala deleted file mode 100644 index 031695e..0000000 --- a/src/main/scala/cc/factorie/app/nlp/segment/PunktSentenceSegmenter.scala +++ /dev/null @@ -1,749 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* -package cc.factorie.app.nlp.segment - -import java.util.regex.Pattern - -import scala.annotation.tailrec -import scala.collection._ - -sealed trait TokenType -case object S extends TokenType -// Sentence boundary marker -case object A extends TokenType -// Abbreviation marker -case object AS extends TokenType -// Abbreviation at end of sentence marker -case object U extends TokenType -// Unknown - -object PunktSentenceSegmenter { - - object Punkt { - - val ORTHO_BEG_UC = 1 - val ORTHO_MID_UC = 1 << 2 - val ORTHO_UNK_UC = 1 << 3 - val ORTHO_BEG_LC = 1 << 4 - val ORTHO_MID_LC = 1 << 5 - val ORTHO_UNK_LC = 1 << 6 - val ORTHO_UC = ORTHO_BEG_UC | ORTHO_MID_UC | ORTHO_UNK_UC - val ORTHO_LC = ORTHO_BEG_LC | ORTHO_MID_LC | ORTHO_UNK_LC - - def hasFlag(flagSet: Int, testFlag: Int): Boolean = (flagSet & testFlag) != 0 - - sealed trait OrthoContext - case object Initial extends OrthoContext - case object Internal extends OrthoContext - case object Unknown extends OrthoContext - - sealed trait Case - case object Upper extends Case - case object Lower extends Case - case object Non extends Case - - val orthoMap = Map[(OrthoContext, Case), Int]( - (Initial, Upper) -> ORTHO_BEG_UC - , (Internal, Upper) -> ORTHO_MID_UC - , (Unknown, Upper) -> ORTHO_UNK_UC - , (Initial, Lower) -> ORTHO_BEG_LC - , (Internal, Lower) -> ORTHO_MID_LC - , (Unknown, Lower) -> ORTHO_UNK_LC) - - class PunktLanguageVars { - val sentenceEndChars = Set(".", "?", "!") - def sentenceEndCharsRegex = "[%s]".format(Pattern.quote(sentenceEndChars.mkString)) - val internalPunctuation = ",:;" - val boundaryRealignmentRegex = """(?s)["')\]}]+?(?:\s+|(?=--)|$)""".r - val wordStartRegex = """[^\("\`{\[:;&\#\*@\)}\]\-,]""" - val nonWordChars = """(?:[?!)";}\]\*:@'\({\[])""" - val multiCharPunctuationRegex = """(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)""" - val wordTokenizeTemplate = """(?x)( - %2$s - | - (?=%3$s)\S+? 
# Accept word characters until end is found - (?= # Sequences marking a word's end - \s| # White-space - $| # End-of-string - %1$s|%2$s| # Punctuation - ,(?=$|\s|%1$s|%2$s) # Comma if at end of word - ) - | - \S - )""" //.replaceAll("\\s+", "") - - lazy val wordTokenizerRegex = { - val re = wordTokenizeTemplate.format(nonWordChars, multiCharPunctuationRegex, wordStartRegex) -// println(re) - re.r - } - - def wordTokenize(s: String) = wordTokenizerRegex.findAllIn(s) - - val periodContextTemplate = """ - [^\s]* - %2$s - (?=( - %1$s - | - \s+([^\s]+) - ))""".replaceAll("\\s+", "") - - lazy val periodContextRegex = { - val re = periodContextTemplate.format(nonWordChars, sentenceEndCharsRegex) -// println(re) - re.r - } - } - - val nonPunctuationRegex = """[^\W\d]""".r - - def iteratePairs[T](it: Iterable[T]): Iterable[(T, T)] = it.toSeq.sliding(2).filter(_.length > 1).map({case Seq(x, y) => (x, y)}).toIterable - - class PunktParameters { - var abbrevTypes = mutable.Set[String]() - var collocations = mutable.Set[(String, String)]() - var sentenceStarters = mutable.Set[String]() - var orthoContext = makeOrthoContext - - def makeOrthoContext = new mutable.HashMap[String, Int]() { - override def default(key: String) = 0 - } - - def clearAbbrevs() = abbrevTypes = mutable.Set[String]() - def clearCollocations() = collocations = mutable.Set[(String, String)]() - def clearSentenceStarters() = sentenceStarters = mutable.Set[String]() - def clearOrthoContext() = orthoContext = makeOrthoContext - def addOrthoContext(typ: String, flag: Int) = orthoContext.update(typ, orthoContext(typ) | flag) - } - - object PunktToken { - val ellipsisRegex = """\.\.+$""".r - val numericRegex = """^-?[\.,]?\d[\d,\.-]*\.?$""".r - val initialRegex = """[^\W\d]\.$""".r - val alphaRegex = """[^\W\d]+$""".r - } - - class PunktToken( - val token: String, - var paraStart: Boolean = false, - var lineStart: Boolean = false, - var sentenceBreak: Boolean = false, - var abbr: Boolean = false, - var ellipsis: Boolean = false) { - - import PunktToken._ - - val periodFinal = token.endsWith(".") - - def getType(tk: String) = - if ( {val fst = tk(0); fst == '.' 
|| fst == '-' || fst.isDigit}) - numericRegex.replaceAllIn(tk.toLowerCase, "##number##") - else - tk.toLowerCase - - val ty = getType(token) - - def typeNoPeriod = if (ty.length > 1 && ty.last == '.') ty.dropRight(1) else ty - def typeNoSentPeriod = if (sentenceBreak) typeNoPeriod else ty - def firstUpper = token(0).isUpper - def firstLower = token(0).isLower - def firstCase = if (firstUpper) Upper else if (firstLower) Lower else Non - def isEllipsis = ellipsisRegex.pattern.matcher(token).matches - def isNumber = ty.startsWith("##number##") - def isInitial = initialRegex.pattern.matcher(token).matches - def isAlpha = alphaRegex.pattern.matcher(token).matches - def isNonPunctuation = nonPunctuationRegex.findFirstIn(ty).isDefined - - def serialize: String = sys.error("unimplemented") - - override def toString: String = { - var res = token - if (abbr) res += "" - if (ellipsis) res += "" - if (sentenceBreak) res += "" - res - } - } - - abstract class PunktBase( - val languageVars: PunktLanguageVars = new PunktLanguageVars(), - parms: PunktParameters = new PunktParameters()) { - - private[this] var p = parms - - def params_=(parms: PunktParameters) = p = parms - def params = p - - def tokenizeWords(plainText: String): mutable.ArrayBuffer[PunktToken] = { - val tokens = new mutable.ArrayBuffer[PunktToken]() - var paraStart = false - val lineIter = plainText.split('\n').iterator - while (lineIter.hasNext) { - val line = lineIter.next() - val stripped = line.trim - if (stripped.isEmpty) { - paraStart = true - } else { - val lineTokens = languageVars.wordTokenize(line) - val firstToken = lineTokens.next() - if (firstToken != "") - tokens += new PunktToken(firstToken, paraStart = paraStart, lineStart = true) - paraStart = false - while (lineTokens.hasNext) { - val tk = lineTokens.next() - if (tk != "") - tokens += new PunktToken(tk) - } - } - } - tokens - } - - def annotateFirstPass(tokens: Iterable[PunktToken]): Unit = - tokens.foreach(firstPassAnnotation(_)) - - def firstPassAnnotation(pt: PunktToken) = { - val tok = pt.token - if (languageVars.sentenceEndChars.contains(tok)) - pt.sentenceBreak = true - else if (pt.isEllipsis) - pt.ellipsis = true - else if (pt.periodFinal && !tok.endsWith("..")) - if (params.abbrevTypes.contains(tok.dropRight(1).toLowerCase) || - params.abbrevTypes.contains(tok.dropRight(1).toLowerCase.split("-").last)) - pt.abbr = true - else - pt.sentenceBreak = true - } - } - - class UnigramFreqDist extends mutable.HashMap[String, Int] { - override def default(key: String) = 0 - def thresholdFreq(threshold: Int): UnigramFreqDist = { - val res = new UnigramFreqDist - var numRemoved = 0 - for ((tok, count) <- this) { - if (count > threshold) numRemoved += 1 - else res(tok) += count - } - res(null) += numRemoved - res - } - } - - class BigramFreqDist extends mutable.HashMap[(String, String), Int] { - override def default(key: (String, String)) = 0 - def thresholdFreq(threshold: Int): BigramFreqDist = { - val res = new BigramFreqDist - var numRemoved = 0 - for ((tok, count) <- this) { - if (count > threshold) numRemoved += 1 - else res(tok) += count - } - res(null) += numRemoved - res - } - } - - class PunktTrainer( - val trainText: Option[String] = None, - val verbose: Boolean = false, - languageVars: PunktLanguageVars = new PunktLanguageVars(), - params: PunktParameters = new PunktParameters()) - extends PunktBase(languageVars, params) { - - var typeFreqDist = new UnigramFreqDist() - var sentenceStarterFreqDist = new UnigramFreqDist() - var collocationFreqDist = new 
BigramFreqDist() - - var numPeriodTokens = 0 - var sentenceBreakCount = 0 - var finalized = false - - val ABBREV = 0.3 - var IGNORE_ABBREV_PENALTY = false - var ABBREV_BACKOFF = 5 - var COLLOCATION = 7.88 - var SENT_STARTER = 30 - var INCLUDE_ALL_COLLOCS = false - var INCLUDE_ABBREV_COLLOCS = false - var MIN_COLLOC_FREQ = 1 - - if (trainText.isDefined) train(trainText.get, verbose, finalize = true) - - def train(text: String, verbose: Boolean = false, finalize: Boolean = true) = { - trainTokensLogic(tokenizeWords(text), verbose) - if (finalize) finalizeTraining(verbose) - } - - def trainTokens(tokens: mutable.ArrayBuffer[PunktToken], verbose: Boolean = false, finalize: Boolean = true) = { - trainTokensLogic(tokens, verbose) - if (finalize) finalizeTraining(verbose) - } - - private def trainTokensLogic(tokens: mutable.ArrayBuffer[PunktToken], verbose: Boolean = false, finalize: Boolean = true) = { - finalized = false - - val tokIter = tokens.iterator - while (tokIter.hasNext) { - val tok = tokIter.next() - typeFreqDist(tok.ty) += 1 - if (tok.periodFinal) numPeriodTokens += 1 - } - - val uniqueTypes = this.uniqueTypes(tokens) - val reclassIter = reclassifyAbbrevTypes(uniqueTypes.toList).iterator - while (reclassIter.hasNext) { - val (abbr, score, isAdd) = reclassIter.next() - if (score >= ABBREV) { - if (isAdd) { - params.abbrevTypes += abbr - if (verbose) println(" Abbreviation (isAdd: %s): [%6.4f] %s" format(isAdd, score, abbr)) - } - } else if (!isAdd) { - params.abbrevTypes -= abbr - if (verbose) println(" Removed abbreviation: [%6.4f] %s" format(score, abbr)) - } - } - - annotateFirstPass(tokens) - annotateOrthographyData(tokens) - sentenceBreakCount += getSentenceBreakCount(tokens) - - val pairIter = iteratePairs(tokens).iterator - while (pairIter.hasNext) { - val (tok1, tok2) = pairIter.next() - if (tok1.periodFinal) { - if (isRareAbbrevType(tok1, tok2)) { - params.abbrevTypes += tok1.typeNoPeriod - if (verbose) println(" Rare Abbrev: %s" format tok1.ty) - } - if (isPotentialSentenceStarter(tok1, tok2)) - sentenceStarterFreqDist(tok2.ty) += 1 - if (isPotentialCollocation(tok1, tok2)) - collocationFreqDist((tok1.typeNoPeriod, tok2.typeNoSentPeriod)) += 1 - } - } - } - - def finalizeTraining(verbose: Boolean = false): Unit = { - params.clearSentenceStarters() - for ((ty, ll) <- findSentenceStarters()) { - params.sentenceStarters += ty - if (verbose) println(" Sent Starter: [%6.4f] %s" format(ll, ty)) - } - params.clearCollocations() - for (((ty1, ty2), ll) <- findCollocations()) { - params.collocations += ((ty1, ty2)) - if (verbose) println(" Collocation: [%6.4f] %s+%s" format(ll, ty1, ty2)) - } - finalized = true - } - - def freqThreshold(orthoThreshold: Int = 2, typeThreshold: Int = 2, collocThreshold: Int = 2, sentenceStartThreshold: Int = 2) = { - if (orthoThreshold > 1) { - val oldOc = params.orthoContext - params.clearOrthoContext() - for ((tok, count) <- typeFreqDist; if count >= orthoThreshold) - params.orthoContext(tok) = oldOc(tok) - } - - typeFreqDist = typeFreqDist.thresholdFreq(typeThreshold) - collocationFreqDist = collocationFreqDist.thresholdFreq(collocThreshold) - sentenceStarterFreqDist = sentenceStarterFreqDist.thresholdFreq(sentenceStartThreshold) - } - - def annotateOrthographyData(tokens: mutable.ArrayBuffer[PunktToken]): Unit = { - var context: OrthoContext = Internal - val tokenIter = tokens.iterator - while (tokenIter.hasNext) { - val tok = tokenIter.next() - if (tok.paraStart && context != Unknown) context = Initial - if (tok.lineStart && context == Internal) 
context = Unknown - val flag = orthoMap.getOrElse((context, tok.firstCase), 0) - if (flag != 0) params.addOrthoContext(tok.typeNoSentPeriod, flag) - if (tok.sentenceBreak) - if (!(tok.isNumber || tok.isInitial)) context = Initial - else context = Unknown - else if (tok.ellipsis || tok.abbr) context = Unknown - else context = Internal - } - } - - def isRareAbbrevType(tok1: PunktToken, tok2: PunktToken): Boolean = { - if (tok1.abbr || !tok1.sentenceBreak) return false - val typ = tok1.typeNoSentPeriod - val count = typeFreqDist(typ) + typeFreqDist(typ.dropRight(1)) - if (params.abbrevTypes.contains(typ) || count >= ABBREV_BACKOFF) - return false - if (languageVars.internalPunctuation.contains(tok2.token.take(1))) { - return true - } else if (tok2.firstLower) { - val typ2 = tok2.typeNoSentPeriod - val typ2OrthoContext = params.orthoContext(typ2) - if (hasFlag(typ2OrthoContext, ORTHO_BEG_UC) && !hasFlag(typ2OrthoContext, ORTHO_MID_UC)) - return true - } - false - } - - def isPotentialSentenceStarter(tok1: PunktToken, tok2: PunktToken): Boolean = - tok1.sentenceBreak && !(tok1.isNumber || tok1.isInitial) && tok2.isAlpha - - def isPotentialCollocation(tok1: PunktToken, tok2: PunktToken): Boolean = { - (INCLUDE_ALL_COLLOCS || - (INCLUDE_ABBREV_COLLOCS && tok1.abbr) || - (tok1.sentenceBreak && - (tok1.isNumber || tok1.isInitial))) && - tok1.isNonPunctuation && - tok2.isNonPunctuation - } - - def findCollocations(): mutable.ArrayBuffer[((String, String), Double)] = { - val collocations = new mutable.ArrayBuffer[((String, String), Double)]() - val typeFreqDistN = sum(typeFreqDist.values) - for (((typ1, typ2), colCount) <- collocationFreqDist; if !params.sentenceStarters.contains(typ2)) { - val typ1Count = typeFreqDist(typ1) + typeFreqDist(typ1 + ".") - val typ2Count = typeFreqDist(typ2) + typeFreqDist(typ2 + ".") - if (typ1Count > 1 && typ2Count > 1 && - MIN_COLLOC_FREQ < colCount && - colCount <= math.min(typ1Count, typ2Count)) { - val ll = colLogLikelihood(typ1Count, typ2Count, colCount, typeFreqDistN) - if (ll >= COLLOCATION && - (typeFreqDistN: Double) / typ1Count > (typ2Count: Double) / colCount) - collocations += (((typ1, typ2), ll)) - } - } - collocations - } - - def sum(xs: Iterable[Int]): Int = { - val iter = xs.iterator - var sum = 0 - while (iter.hasNext) sum += iter.next() - sum - } - - def reclassifyAbbrevTypes(uniques: List[String]): List[(String, Double, Boolean)] = { - val typeFreqDistN = sum(typeFreqDist.values) - @tailrec def loop( - uniques: List[String] = uniques, - output: List[(String, Double, Boolean)] = List()): List[(String, Double, Boolean)] = uniques match { - case curTokenType :: rest => - val isAdd = curTokenType.endsWith(".") - if (!nonPunctuationRegex.findFirstIn(curTokenType).isDefined || - curTokenType == "##number##" || - (isAdd && params.abbrevTypes.contains(curTokenType)) || - (!isAdd && !params.abbrevTypes.contains(curTokenType))) - loop(rest, output) - else { - val typ = if (isAdd) curTokenType.dropRight(1) else curTokenType - val numPeriods = typ.count(".".==) + 1 - val numNonPeriods = typ.length - numPeriods + 1 - val countWithPeriod = typeFreqDist(typ + ".") - val countWithoutPeriod = typeFreqDist(typ) - val ll = dunningLogLikelihood( - countWithPeriod + countWithoutPeriod, - numPeriodTokens, - countWithPeriod, - typeFreqDistN) - val fLength = math.exp(-numNonPeriods) - val fPeriods = numPeriods - val fPenalty = if (IGNORE_ABBREV_PENALTY) 1 else math.pow(numNonPeriods, -countWithoutPeriod) - val score = ll * fLength * fPeriods * fPenalty - loop(rest, (typ, 
score, isAdd) :: output) - } - case _ => output - } - loop() - } - - def dunningLogLikelihood(countA: Int, countB: Int, countAB: Int, N: Int) = { - val p1 = (countB: Double) / N - val p2 = 0.99 - val nullHypo = (countAB: Double) * math.log(p1) + (countA - countAB) * math.log(1.0 - p1) - val altHypo = (countAB: Double) * math.log(p2) + (countA - countAB) * math.log(1.0 - p2) - val likelihood = nullHypo - altHypo - -2.0 * likelihood - } - - def colLogLikelihood(countA: Int, countB: Int, countAB: Int, N: Int) = { - val p = (countB: Double) / N - val p1 = (countAB: Double) / countA - val p2 = (countB - countAB: Double) / (N - countA) - val summand1 = countAB * math.log(p) + (countA - countAB) * math.log(1.0 - p) - val summand2 = (countB - countAB) * math.log(p) + (N - countA - countB + countAB) * math.log(1.0 - p) - val summand3 = - if (countA == countAB) 0 - else countAB * math.log(p1) + (countA - countAB) * math.log(1.0 - p1) - val summand4 = - if (countB == countAB) 0 - else (countB - countAB) * math.log(p2) + (N - countA - countB + countAB) * math.log(1.0 - p2) - val likelihood = summand1 + summand2 - summand3 - summand4 - -2.0 * likelihood - } - - def findAbbrevTypes() = { - params.clearAbbrevs() - val tokens = typeFreqDist.keys.filter(ty => ty != null && ty.endsWith(".")).toList - for ((abbr, score, isAdd) <- reclassifyAbbrevTypes(tokens); if score >= ABBREV) - params.abbrevTypes += abbr - } - - def uniqueTypes(tokens: Iterable[PunktToken]) = { - val uniques = new mutable.HashSet[String]() - val iter = tokens.iterator - while (iter.hasNext) - uniques += iter.next().ty - uniques - } - - def findSentenceStarters(): Iterable[(String, Double)] = { - val typeFreqDistN = sum(typeFreqDist.values) - for { - (typ, typAtBreakCount) <- sentenceStarterFreqDist - if typ != null - typCount = typeFreqDist(typ) + typeFreqDist(typ + ".") - if typCount >= typAtBreakCount - ll = colLogLikelihood(sentenceBreakCount, typCount, typAtBreakCount, typeFreqDistN) - if ll >= SENT_STARTER && (typeFreqDistN: Double) / sentenceBreakCount > (typCount: Double) / typAtBreakCount - } yield (typ, ll) - } - - def getSentenceBreakCount(tokens: Iterable[PunktToken]) = tokens.count(_.sentenceBreak) - } - - class PunktSentenceTokenizer( - val trainText: Option[String] = None, - val verbose: Boolean = false, - languageVars: PunktLanguageVars = new PunktLanguageVars, - parms: PunktParameters = new PunktParameters()) extends PunktBase(languageVars, parms) { - - val PUNCTUATION = Set(";", ":", ",", ".", "!", "?") - - if (trainText != None) super.params_=(train(trainText.get, verbose)) - - def train(trainText: String, verbose: Boolean = false) = - new PunktTrainer(Some(trainText), verbose, languageVars, params).params - - def sentencesFromText(text: String, realignBoundaries: Boolean = false) = { - var sents = slicesFromText(text).map({case (s1, s2, _) => text.substring(s1, s2)}) - if (realignBoundaries) sents = this.realignBoundaries(sents) - sents - } - - def annotateTokens(tokens: Iterable[PunktToken]): Iterable[PunktToken] = { - annotateFirstPass(tokens) - // println(tokens) - // println(tokens.map(_.ty)) - annotateSecondPass(tokens) - // println(tokens) - // println(tokens.map(_.ty)) - tokens - } - - def buildSentenceList(text: String, tokens: mutable.ArrayBuffer[PunktToken]): mutable.ArrayBuffer[String] = { - val output = new mutable.ArrayBuffer[String]() - var pos = 0 - val wsRegex = """\s*""".r - var sentence = "" - for (token <- tokens) { - var tok = token.token - val wsMatcher = 
wsRegex.pattern.matcher(text.substring(pos)) - val ws = if (wsMatcher.matches) wsMatcher.group(0) else "" - pos += ws.length - if (text.substring(pos, pos + tok.length) != tok) { - val pat = tok.map(c => Pattern.quote(c.toString)).mkString( """\s*""") - val m = pat.r.pattern.matcher(text.substring(pos)) - if (m.matches) tok = m.group(0) - } - - pos += tok.length - sentence += (if (sentence != "") ws + tok else tok) - if (token.sentenceBreak) { - output += sentence - sentence = "" - } - } - if (sentence != "") output += sentence - output - } - - def annotateSecondPass(tokens: Iterable[PunktToken]): Unit = - for ((t1, t2) <- iteratePairs(tokens)) secondPassAnnotation(t1, t2) - - def secondPassAnnotation(tok1: PunktToken, tok2: PunktToken): Unit = { - if (!tok1.periodFinal) return - val typ = tok1.typeNoPeriod - val nextType = tok2.typeNoSentPeriod - val tokIsInitial = tok1.isInitial - - if (params.collocations.contains((typ, nextType))) { - tok1.sentenceBreak = false - tok1.abbr = true - return - } - - if ((tok1.abbr || tok1.ellipsis) && !tokIsInitial) { - val isSentenceStarter = orthoHeuristic(tok2) - if (isSentenceStarter.isDefined && isSentenceStarter.get) { - tok1.sentenceBreak = true - return - } - if (tok2.firstUpper && params.sentenceStarters.contains(nextType)) { - tok1.sentenceBreak = true - return - } - } - - if (tokIsInitial || typ == "##number##") { - val isSentenceStarter = orthoHeuristic(tok2) - if (isSentenceStarter.isDefined && !isSentenceStarter.get) { - tok1.sentenceBreak = false - tok1.abbr = true - return - } - if (!isSentenceStarter.isDefined && tokIsInitial && - tok2.firstUpper && - !hasFlag(params.orthoContext(nextType), ORTHO_LC)) { - tok1.sentenceBreak = false - tok1.abbr = true - } - } - } - - def orthoHeuristic(tok: PunktToken): Option[Boolean] = { - if (PUNCTUATION.contains(tok.token)) - Some(false) - else { - val orthoContext = params.orthoContext(tok.typeNoSentPeriod) - if (tok.firstUpper && hasFlag(orthoContext, ORTHO_LC) && !hasFlag(orthoContext, ORTHO_MID_UC)) - Some(true) - else if (tok.firstLower && (hasFlag(orthoContext, ORTHO_UC) || !hasFlag(orthoContext, ORTHO_BEG_LC))) - Some(false) - else - None - } - } - - def textContainsSentenceBreak(text: String): Option[PunktToken] = { - val annotated = annotateTokens(tokenizeWords(text)) - // println(annotated) - annotated.dropRight(1).find(_.sentenceBreak) - } - - def slicesFromText(text: String): mutable.ArrayBuffer[(Int, Int, TokenType)] = { - var lastBreak = 0 - val output = new mutable.ArrayBuffer[(Int, Int, TokenType)]() - val mIter = languageVars.periodContextRegex.findAllIn(text).matchData - while (mIter.hasNext) { - val m = mIter.next() - val context = m.group(0) + m.group(1) - // println(context) - val break = textContainsSentenceBreak(context) - if (break.isDefined) { - output += ((lastBreak, m.end, if (break.get.abbr) AS else S)) - lastBreak = if (m.groupNames.length > 2) m.start(2) else m.end - } - } - output += ((lastBreak, text.length, S)) - output - } - - def realignBoundaries(sents: mutable.ArrayBuffer[String]): mutable.ArrayBuffer[String] = { - var realign = 0 - val output = new mutable.ArrayBuffer[String]() - for ((s1Unfixed, s2) <- iteratePairs(sents)) { - val s1 = s1Unfixed.substring(realign, s1Unfixed.length) - val m = languageVars.boundaryRealignmentRegex.findFirstMatchIn(s2) - if (m.isDefined) { - output += (s1 + m.get.group(0).trim) - realign = m.get.end - } else { - realign = 0 - output += s1 - } - } - output - } - - def tokenize(text: String, realignBoundaries: Boolean = false) = 
sentencesFromText(text, realignBoundaries) - - def spanTokenize(text: String) = slicesFromText(text) - } - } - - import Punkt._ - - def findSentenceBoundaries(text: String, abvSet: Set[String] = Set[String](), sentStarters: Set[String] = Set[String]()): Iterable[(Int, TokenType)] = { - val params = new PunktParameters - params.abbrevTypes ++= abvSet - params.sentenceStarters ++= sentStarters - val tokenizer = new PunktSentenceTokenizer(trainText = Some(text), verbose = true, parms = params) - val sentenceBoundaries = tokenizer.slicesFromText(text).map({case (_, b, t) => (b, t)}) - Seq((0, S)) ++ sentenceBoundaries - } - - def findCommonAbbreviations(text: String, abvSet: Set[String] = Set[String](), sentStarters: Set[String] = Set[String]()): Set[String] = { - val params = new PunktParameters - params.abbrevTypes ++= abvSet - params.sentenceStarters ++= sentStarters - val trainer = new PunktTrainer(trainText = Some(text), params = params) - trainer.params.abbrevTypes - } - - def findCommonSentenceStarters(text: String, abvSet: Set[String] = Set[String](), sentStarters: Set[String] = Set[String]()): Set[String] = { - val params = new PunktParameters - params.abbrevTypes ++= abvSet - params.sentenceStarters ++= sentStarters - val trainer = new PunktTrainer(trainText = Some(text), params = params) - trainer.params.sentenceStarters - } - - def main(args: Array[String]): Unit = { - val text = scala.io.Source.fromFile( """C:\wsj_processed.txt""").getLines().mkString - val params = new PunktParameters - params.abbrevTypes ++= Set( - "inc", "corp", "dec", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "oct", "nov", "ala", - "ariz", "ark", "colo", "conn", "del", "fla", "ill", "ind", "kans", "kan", "ken", "kent", "mass", "mich", - "minn", "miss", "mont", "nebr", "neb", "nev", "dak", "okla", "oreg", "tenn", "tex", "virg", "wash", "wis", - "wyo", "mr", "ms", "mrs", "calif", "oct", "vol", "rev", "ltd", "dea", "est", "capt", "hev", "gen", "ltd", "etc", "sci", - "comput", "univ", "ave", "cent", "col", "comdr", "cpl", "dept", "dust,", "div", "est", "gal", "gov", "hon", - "grad", "inst", "lib", "mus", "pseud", "ser", "alt", "Inc", "Corp", "Dec", "Jan", "Feb", "Mar", "Apr", - "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Ala", "Ariz", "Ark", "Colo", "Conn", "Del", "Fla", "Ill", - "Ind", "Kans", "Kan", "Ken", "Kent", "Mass", "Mich", "Minn", "Miss", "Mont", "Nebr", "Neb", "Nev", "Dak", - "Okla", "Oreg", "Tenn", "Tex", "Virg", "Wash", "Wis", "Wyo", "Mrs", "Calif", "Oct", "Vol", "Rev", "Ltd", - "Dea", "Est", "Capt", "Hev", "Gen", "Ltd", "Etc", "Sci", "Comput", "Univ", "Ave", "Cent", "Col", "Comdr", - "Cpl", "Dept", "Dust,", "Div", "Est", "Gal", "Gov", "Hon", "Grad", "Inst", "Lib", "Mus", "Pseud", "Ser", "Alt", - "Mr", "Ms") - val start = System.currentTimeMillis() - for (i <- 1 to 5) { - val tokenizer = new PunktSentenceTokenizer(trainText = Some(text), verbose = false, parms = params) - // tokenizer.params.abbrevTypes.foreach(println(_)) - val sfromt = tokenizer.sentencesFromText(text) -// println(sfromt.length) -// sfromt.foreach(println(_)) -// println(tokenizer.params.abbrevTypes) - } - println(System.currentTimeMillis() - start) - // - // val text = Source.fromFile( """C:\Users\Luke\Documents\Code\IESL\SentenceBoundaryDetector\wsj_text.txt""").getLines().mkString(" ") - // val start = System.currentTimeMillis() - // for (i <- 1 until 2) - // findSentenceBoundaries(text).foreach(println(_)) - // println(System.currentTimeMillis() - start) - } -} -*/ \ No newline at end of file diff --git 
a/src/main/scala/cc/factorie/app/nlp/segment/PunktTokenizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/PunktTokenizer.scala deleted file mode 100644 index 83817c6..0000000 --- a/src/main/scala/cc/factorie/app/nlp/segment/PunktTokenizer.scala +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* -package cc.factorie.app.nlp.segment - -import cc.factorie.app.nlp.{Document, DocumentAnnotator, Sentence, Token} -import cc.factorie.app.strings.StringSegmentIterator - -object DefaultRules { - val contractionsAndPossessives = """((?i)'(s|d|m|l+|ve|re)\b)|((?i)n't\b)""" - val singleLetterAcronyms = """[\p{L}]\.[\p{L}\.]*""" - val allAbbrevs = """([\p{L}]+\.)""" - val ordinals = "[0-9]{1,2}[sthnrd]+[\\-\\p{L}]+" - val notEndingInDot = "[0-9\\-.\\:/,\\+\\=%><]+[0-9\\-:/,\\+\\=%><]" - val possiblyEndingInDot = "[0-9\\-.\\:/,\\+\\=%]+" - val email = """(?i)\b[\p{L}\p{Nd}._%+-]+@[\p{L}\p{Nd}.-]+\.[A-Z]{2,4}\b""" - val url1 = """\b(https?|ftp|file)://[-\p{L}\p{Nd}+&@#/%?=~_|!:,.;]*[-\p{L}\p{Nd}+&@#/%=~_|]""" - val url2 = """\b[wW]{3}.(([-\p{L}\p{Nd}+&@#/%?=~_|!:,;]+(?=\.))\.)+[A-Za-z]{2,4}(/[-\p{L}\p{Nd}+&@#/%?=~_|!:,;]*)?""" - val finalPunctuation1 = """[.?!]["')}\]]?""" - // why does this have square and curly brackets in it?? 
- val finalPunctuation2 = """["')}\]]?[.?!]""" - val midSentenceQuotes = "[`'\"]+" - val otherSymbols = """[,\-:;$?&@\(\)]+""" - val alphanumericsAndHyphensPrecedingContractionsOrPossessives = """[\p{L}\p{N}\-]+(?=(?i)('(s|d|m|l+|ve|re))|(n't))""" - val wordsWithSequencesOfSingleDashesInside = "[\\w]+(-[\\w]+)*" - val wordWithNumberAndApostrophe = "[\\w']+" - - val commonAbbreviations = Set( - "inc", "corp", "dec", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "oct", "nov", "ala", - "ariz", "ark", "colo", "conn", "del", "fla", "ill", "ind", "kans", "kan", "ken", "kent", "mass", "mich", - "minn", "miss", "mont", "nebr", "neb", "nev", "dak", "okla", "oreg", "tenn", "tex", "virg", "wash", "wis", - "wyo", "mr", "ms", "mrs", "calif", "oct", "vol", "rev", "ltd", "dea", "est", "capt", "hev", "gen", "ltd", "etc", "sci", - "comput", "univ", "ave", "cent", "col", "comdr", "cpl", "dept", "dust,", "div", "est", "gal", "gov", "hon", - "grad", "inst", "lib", "mus", "pseud", "ser", "alt", "Inc", "Corp", "Dec", "Jan", "Feb", "Mar", "Apr", - "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Ala", "Ariz", "Ark", "Colo", "Conn", "Del", "Fla", "Ill", - "Ind", "Kans", "Kan", "Ken", "Kent", "Mass", "Mich", "Minn", "Miss", "Mont", "Nebr", "Neb", "Nev", "Dak", - "Okla", "Oreg", "Tenn", "Tex", "Virg", "Wash", "Wis", "Wyo", "Mrs", "Calif", "Oct", "Vol", "Rev", "Ltd", - "Dea", "Est", "Capt", "Hev", "Gen", "Ltd", "Etc", "Sci", "Comput", "Univ", "Ave", "Cent", "Col", "Comdr", - "Cpl", "Dept", "Dust,", "Div", "Est", "Gal", "Gov", "Hon", "Grad", "Inst", "Lib", "Mus", "Pseud", "Ser", "Alt", - "Mr", "Ms") - - val commonSentenceStarters = Set("The") - - val defaultRuleset = Seq( - contractionsAndPossessives - , singleLetterAcronyms - , allAbbrevs - , ordinals - , possiblyEndingInDot - , email - , url1 - , url2 - , finalPunctuation1 - , finalPunctuation2 - , midSentenceQuotes - , otherSymbols - , alphanumericsAndHyphensPrecedingContractionsOrPossessives - , wordsWithSequencesOfSingleDashesInside - , wordWithNumberAndApostrophe) - - val defaultRulesetNoSentenceBoundaries = Seq( - contractionsAndPossessives - , singleLetterAcronyms - , ordinals - , notEndingInDot - , email - , url1 - , url2 - , commonAbbreviations.mkString("|") - , finalPunctuation1 - , finalPunctuation2 - , midSentenceQuotes - , otherSymbols - , alphanumericsAndHyphensPrecedingContractionsOrPossessives - , wordsWithSequencesOfSingleDashesInside - , wordWithNumberAndApostrophe) -} - -sealed trait SentenceBoundaryInference -case object PerDocument extends SentenceBoundaryInference -case object JointlyAcrossDocuments extends SentenceBoundaryInference -case object Non extends SentenceBoundaryInference - -object PunktTokenizer extends PunktTokenizer - -class PunktTokenizer extends DocumentAnnotator { - - def tokenAnnotationString(token: Token) = token.string + "\t" - - def commonAbbreviations: Set[String] = DefaultRules.commonAbbreviations - def commonSentenceStarters: Set[String] = DefaultRules.commonSentenceStarters - def sentenceBoundaryInference: SentenceBoundaryInference = JointlyAcrossDocuments - - def ruleset: Seq[String] = - if (sentenceBoundaryInference == Non) DefaultRules.defaultRulesetNoSentenceBoundaries - else DefaultRules.defaultRuleset - - private[this] val regex = ruleset.mkString("|").r - -// def apply(s: String): StringSegmentIterator = new StringSegmentIterator { -// val doc = new Document(s) -// process(doc) -// var i = 0 -// val len = doc.tokens.length -// def hasNext = i < len - 1 -// def next: String = { val result = 
doc.tokens(i).string; i += 1; result } -// def start = doc.tokens(i).stringStart -// def end = doc.tokens(i).stringEnd -// //doc.tokens.map(_.string).iterator -// } - - def apply(s: String): StringSegmentIterator = new StringSegmentIterator { - val tokenIterator = for (section <- process(new Document(s)).sections.iterator; token <- section.tokens.iterator) yield token - var token: Token = null - def hasNext = tokenIterator.hasNext - def next(): String = { token = tokenIterator.next(); token.string } - def start = token.stringStart - def end = token.stringEnd - } - - - // TODO Fix this to fit better into the DocumentProcessor framework, e.g. setting postAttrs - def process(documents: Seq[Document]): Unit = processLogic(documents, sentenceBoundaryInference) - - def process(document: Document): Document = { processLogic(Seq(document), sentenceBoundaryInference); document } - def prereqAttrs: Iterable[Class[_]] = Nil - def postAttrs: Iterable[Class[_]] = Vector[Class[_]](classOf[Token], classOf[Sentence]) - - // TODO Fix to obey document.sections! -akm - private[this] def processLogic(documents: Seq[Document], inference: SentenceBoundaryInference): Unit = inference match { - case PerDocument => documents.foreach(d => processLogic(Seq(d), JointlyAcrossDocuments)) - case Non => - for (d <- documents; section <- d.sections) { - val tokenIterator = regex.findAllIn(section.string) - while (tokenIterator.hasNext) { - tokenIterator.next() - new Token(d, tokenIterator.start, tokenIterator.end) - } - new Sentence(section, 0, d.tokenCount) - } - case JointlyAcrossDocuments => - val docString = documents.map(_.string).mkString(" ") - val sentenceSegmented = PunktSentenceSegmenter.findSentenceBoundaries(docString, abvSet = commonAbbreviations).toArray - var tokensSoFar = 0 - var d = 0 - var currentDocument = documents(d) - var docOffset = 0 - val segmentsIterator = sentenceSegmented.sliding(2) - while (segmentsIterator.hasNext) { - var Array((start, _), (end, endTy)) = segmentsIterator.next() - val endIsAbbrev = endTy == AS || endTy == A - /* end isn't an abbrev, so remove the period by making end -= 1 and then in the regex you can have all things containing '.' be abbrevs */ - if (!endIsAbbrev) {end -= 1} - if (end > docOffset + currentDocument.string.length + 1) { - d += 1 - docOffset += currentDocument.string.length + 1 - currentDocument = documents(d) - tokensSoFar = 0 - } - val currentDocumentOffset = start - docOffset - val tokenIterator = regex.findAllIn(docString.substring(start, end)) - var numTokens = 0 - while (tokenIterator.hasNext) { - tokenIterator.next() - new Token(currentDocument, math.max(0, currentDocumentOffset + tokenIterator.start), currentDocumentOffset + tokenIterator.end) // really? - numTokens += 1 - } - if (!endIsAbbrev) { - new Token(currentDocument.asSection, end - docOffset, end + 1 - docOffset) - numTokens += 1 - } - new Sentence(currentDocument.asSection, tokensSoFar, numTokens) // really? 
- tokensSoFar += numTokens - } - } -} -*/ diff --git a/src/main/scala/cc/factorie/app/nlp/segment/TokenNormalizer.scala b/src/main/scala/cc/factorie/app/nlp/segment/TokenNormalizer1.scala similarity index 64% rename from src/main/scala/cc/factorie/app/nlp/segment/TokenNormalizer.scala rename to src/main/scala/cc/factorie/app/nlp/segment/TokenNormalizer1.scala index cfec10a..8aaefe9 100644 --- a/src/main/scala/cc/factorie/app/nlp/segment/TokenNormalizer.scala +++ b/src/main/scala/cc/factorie/app/nlp/segment/TokenNormalizer1.scala @@ -1,18 +1,6 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - package cc.factorie.app.nlp.segment -import cc.factorie.app.nlp._ + +import cc.factorie.app.nlp.{Document, DocumentAnnotator, Token, TokenString} import scala.reflect.ClassTag @@ -20,23 +8,23 @@ import scala.reflect.ClassTag The aim here is to to put into plain text, the way most people would write an email message, e.g. un-escaped asterisks, plain quote characters, etc. */ class TokenNormalizer1[A<:TokenString]( - val newTokenString: (Token,String) => A, // potentially a specialized subclass of TokenString, to reflect different choices here. - val normalizeQuote:Boolean = true, // Convert all double quotes to " - val normalizeApostrophe:Boolean = true, // Convert all apostrophes to ', even within token strings - val normalizeCurrency:Boolean = true, // Convert all currency symbols to "$", except cents symbol to "cents" - val normalizeAmpersand:Boolean = true, // Convert all ampersand symbols (including "&" to "&" - val normalizeFractions:Boolean = true, // Convert unicode fraction characters to their spelled out analogues, like "3/4" - val normalizeEllipsis:Boolean = true, // Convert unicode ellipsis character to spelled out analogue, "..." - val undoPennParens:Boolean = true, // Change -LRB- etc to "(" etc. - val unescapeSlash:Boolean = true, // Change \/ to / - val unescapeAsterisk:Boolean = true, // Change \* to * - val normalizeMDash:Boolean = true, // Convert all em-dashes to double dash -- - val normalizeDash:Boolean = true, // Convert all other dashes to single dash - - val normalizeHtmlSymbol:Boolean = true, // Convert < to <, etc - val normalizeHtmlAccent:Boolean = true, // Convert Beyoncé to Beyonce - val americanize:Boolean = false - )(implicit m:ClassTag[A]) extends DocumentAnnotator { - + val newTokenString: (Token,String) => A, // potentially a specialized subclass of TokenString, to reflect different choices here. 
+ val normalizeQuote:Boolean = true, // Convert all double quotes to " + val normalizeApostrophe:Boolean = true, // Convert all apostrophes to ', even within token strings + val normalizeCurrency:Boolean = true, // Convert all currency symbols to "$", except cents symbol to "cents" + val normalizeAmpersand:Boolean = true, // Convert all ampersand symbols (including "&amp;") to "&" + val normalizeFractions:Boolean = true, // Convert unicode fraction characters to their spelled out analogues, like "3/4" + val normalizeEllipsis:Boolean = true, // Convert unicode ellipsis character to spelled out analogue, "..." + val undoPennParens:Boolean = true, // Change -LRB- etc to "(" etc. + val unescapeSlash:Boolean = true, // Change \/ to / + val unescapeAsterisk:Boolean = true, // Change \* to * + val normalizeMDash:Boolean = true, // Convert all em-dashes to double dash -- + val normalizeDash:Boolean = true, // Convert all other dashes to single dash - + val normalizeHtmlSymbol:Boolean = true, // Convert &lt; to <, etc + val normalizeHtmlAccent:Boolean = true, // Convert Beyoncé to Beyonce + val americanize:Boolean = false + )(implicit m:ClassTag[A]) extends DocumentAnnotator { + val dashRegex = ("\\A("+DeterministicRegexTokenizer.dash+")+\\Z").r val mdashRegex = ("\\A("+DeterministicRegexTokenizer.mdash+")+\\Z").r //val quote = "``|''|[\u2018\u2019\u201A\u201B\u201C\u201D\u0091\u0092\u0093\u0094\u201A\u201E\u201F\u2039\u203A\u00AB\u00BB]{1,2}|[`\"\u201C\u201D\\p{Pf}]|&quot;|(?:['\u0092\u2019]|'){1,2}" @@ -49,7 +37,7 @@ class TokenNormalizer1[A<:TokenString]( val htmlSymbolMap = new scala.collection.mutable.HashMap[String,String] { override def default(s:String) = s } ++= List("&lt;" -> "<", "&gt;" -> ">", "&amp;" -> "&", "&copy;" -> "(c)", "&reg;" -> "(r)", "&trade;" -> "(TM)", "&rsquo;" -> "'", "&lsquo;" -> "'") // TODO complete this collection - + // TODO Normalize to `` and '' for better PosTag prediction, etc. def processToken(token:Token): Unit = { val string = token.string @@ -86,21 +74,4 @@ class TokenNormalizer1[A<:TokenString]( override def tokenAnnotationString(token:Token): String = null def prereqAttrs: Iterable[Class[_]] = List(classOf[Token]) def postAttrs: Iterable[Class[_]] = List(m.runtimeClass) -} - -class PlainNormalizedTokenString(token:Token, str:String) extends TokenString(token, str) -object PlainTokenNormalizer extends TokenNormalizer1((t:Token, s:String) => new PlainNormalizedTokenString(t,s)) - -class OntonotesNormalizedTokenString(token:Token, str:String) extends PlainNormalizedTokenString(token, str) -object OntonotesTokenNormalizer extends TokenNormalizer1((t:Token, s:String) => new OntonotesNormalizedTokenString(t,s)) { - override def processToken(token:Token): Unit = { - super.processToken(token) - // TODO Add more normalization here (not yet sure what needed), but keep Lemma issues separate! - // coexist -> co-exist - } -} - -object BritishToAmerican extends scala.collection.mutable.HashMap[String,String] { - this("colour") = "color" - // TODO Add more, e.g.
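A hedged sketch of applying the normalizer above: PlainTokenNormalizer (split into its own file earlier in this patch) attaches a PlainNormalizedTokenString only where a rewrite applies, so the lookup below is defensive; it assumes TokenString exposes its value like other factorie string variables, and the input text is illustrative.

import cc.factorie.app.nlp.Document
import cc.factorie.app.nlp.segment.{DeterministicTokenizer, PlainNormalizedTokenString, PlainTokenNormalizer}

val doc = new Document("Beyoncé arrived -LRB- late -RRB- yesterday")
DeterministicTokenizer.process(doc)   // tokens only; normalization is left to the annotator below
PlainTokenNormalizer.process(doc)     // e.g. -LRB- becomes "(" and the accent is stripped, per the flags above
for (token <- doc.tokens) {
  val normalized = Option(token.attr[PlainNormalizedTokenString]).map(_.value).getOrElse(token.string)
  println(token.string + " -> " + normalized)
}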
see http://oxforddictionaries.com/us/words/british-and-american-spelling -} +} \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/nlp/wordnet/Synset.scala b/src/main/scala/cc/factorie/app/nlp/wordnet/Synset.scala new file mode 100644 index 0000000..81a564c --- /dev/null +++ b/src/main/scala/cc/factorie/app/nlp/wordnet/Synset.scala @@ -0,0 +1,23 @@ +package cc.factorie.app.nlp.wordnet + +import scala.collection.mutable + +class Synset(val id: String, val hyps: Set[String], val ants: Set[String], wn: WordNet) { + def antonyms(): Set[Synset] = this.ants.map(x => wn.allSynsets(x)) + + /* get the parent synsets (hypernyms) of this synset */ + def hypernyms(): Set[Synset] = this.hyps.map(x => wn.allSynsets(x)) + + /* recursively get all parent synsets (hypernyms) of this synset */ + def allHypernyms(): Set[Synset] = { + val result = mutable.Set[Synset]() + def visit(s: Synset) { + if (!result.contains(s)) { + result.add(s) + s.hypernyms().foreach(visit) + } + } + visit(this) + result.toSet + } +} diff --git a/src/main/scala/cc/factorie/app/nlp/wordnet/WordNet.scala b/src/main/scala/cc/factorie/app/nlp/wordnet/WordNet.scala index f069cfd..0f41770 100644 --- a/src/main/scala/cc/factorie/app/nlp/wordnet/WordNet.scala +++ b/src/main/scala/cc/factorie/app/nlp/wordnet/WordNet.scala @@ -13,10 +13,10 @@ package cc.factorie.app.nlp.wordnet +import cc.factorie.app.nlp.lemma.WordNetLemmatizer import cc.factorie.util.ClasspathURL import scala.collection.immutable.HashMap -import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.io.Source @@ -41,7 +41,7 @@ class WordNet(val inputStreamFactory: String=>java.io.InputStream) { * all of the data and index files and extracts information from them * To speed this up, we can combine the wnLemmatizer intialization and the * WordNet intialization used below */ - val wnLemmatizer = new cc.factorie.app.nlp.lemma.WordNetLemmatizer(inputStreamFactory) + val wnLemmatizer = new WordNetLemmatizer(inputStreamFactory) /* There are 2 types of files we deal with here for wordnet: 1) the data file - this file has 1 line per synset and @@ -190,25 +190,7 @@ class WordNet(val inputStreamFactory: String=>java.io.InputStream) { } -class Synset(val id: String, val hyps: Set[String], val ants: Set[String], wn: WordNet) { - def antonyms(): Set[Synset] = this.ants.map(x => wn.allSynsets(x)) - /* get the parent synsets (hypernyms) of this synset */ - def hypernyms(): Set[Synset] = this.hyps.map(x => wn.allSynsets(x)) - - /* recursively get all parent synsets (hypernyms) of this synset */ - def allHypernyms(): Set[Synset] = { - val result = mutable.Set[Synset]() - def visit(s: Synset) { - if (!result.contains(s)) { - result.add(s) - s.hypernyms().foreach(visit) - } - } - visit(this) - result.toSet - } -} object WordNet extends WordNet(s => ClasspathURL.fromDirectory[WordNet](s).openConnection().getInputStream) diff --git a/src/main/scala/cc/factorie/app/strings/PorterStemmer.scala b/src/main/scala/cc/factorie/app/strings/PorterStemmer.scala index e6be802..a95a895 100644 --- a/src/main/scala/cc/factorie/app/strings/PorterStemmer.scala +++ b/src/main/scala/cc/factorie/app/strings/PorterStemmer.scala @@ -10,14 +10,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
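The Synset class moved into its own file above is a thin wrapper over id sets; this sketch walks the transitive hypernym closure and assumes WordNet.allSynsets is the id -> Synset map that Synset itself dereferences (the choice of synset is arbitrary).

import cc.factorie.app.nlp.wordnet.{Synset, WordNet}

// Collect the ids of every transitive hypernym of a synset.
def hypernymIds(s: Synset): Set[String] = s.allHypernyms().map(_.id)

val someSynset: Synset = WordNet.allSynsets.values.head  // assumption: allSynsets maps synset id -> Synset
println(hypernymIds(someSynset).mkString(", "))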
*/ -//package cc.factorie.app.strings +package cc.factorie.app.strings + +import scala.io.Source /** * Rewritten from http://tartarus.org/martin/PorterStemmer/scala.txt * for thread-safety and style (but definitely not too pretty yet). * @author Brian Martin */ -/* + object PorterStemmer { val vowels = "aeiou" val step1aVals = List(("sses", "ss"), ("ies","i"), ("ss","ss"), ("s", "")) @@ -139,7 +141,7 @@ object PorterStemmer { def apply(s:String): String = applySteps(s) def main(args: Array[String]): Unit = { - def getOWPL(f: String) = io.Source.fromFile(f).getLines().toSeq.map(_.trim) + def getOWPL(f: String) = Source.fromFile(f).getLines().toSeq.map(_.trim) if (args.length != 2) println("Expected arguments are a OWPL file of unstemmed and a OWPL file of properly stemmed words to check against.\n" + @@ -160,4 +162,3 @@ object PorterStemmer { stemmed.zip(trueStemmed).take(20).foreach(s => println("sample: " + s._1 + " " + s._2)) } } -*/ \ No newline at end of file diff --git a/src/main/scala/cc/factorie/app/strings/package.scala b/src/main/scala/cc/factorie/app/strings/package.scala index 0663c6e..988317f 100644 --- a/src/main/scala/cc/factorie/app/strings/package.scala +++ b/src/main/scala/cc/factorie/app/strings/package.scala @@ -13,6 +13,8 @@ package cc.factorie.app +import cc.factorie.app.nlp.lexicon.NumberWords + package object strings { /** Read the entire contents of the InputStream with the given encoding, and return them as a String. */ @@ -77,10 +79,10 @@ package object strings { else word } def collapseDigits(word:String): String = { - if (cc.factorie.app.nlp.lexicon.NumberWords.containsWord(word) || containsDigitRegex.findFirstIn(word).nonEmpty) "0" else word + if (NumberWords.containsWord(word) || containsDigitRegex.findFirstIn(word).nonEmpty) "0" else word } def replaceDigits(word:String): String = { - if (cc.factorie.app.nlp.lexicon.NumberWords.containsWord(word)) "" else digitsRegex.replaceAllIn(word, "0") + if (NumberWords.containsWord(word)) "" else digitsRegex.replaceAllIn(word, "0") } /** Implements Levenshtein Distance, with specific operation costs to go from this String to String s2. */ diff --git a/src/main/scala/cc/factorie/app/topics/lda/Document.scala b/src/main/scala/cc/factorie/app/topics/lda/Document.scala index 9450587..c98d8d1 100644 --- a/src/main/scala/cc/factorie/app/topics/lda/Document.scala +++ b/src/main/scala/cc/factorie/app/topics/lda/Document.scala @@ -12,8 +12,9 @@ limitations under the License. 
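A quick sketch of the string helpers touched in this hunk; the expected outputs follow from the collapseDigits definition above and from the Porter algorithm this patch re-enables.

import cc.factorie.app.strings.{PorterStemmer, collapseDigits}

println(collapseDigits("1996-08-22"))  // "0": the word contains digits, so it collapses
println(collapseDigits("lamb"))        // "lamb": no digits and not a number word, unchanged
println(PorterStemmer("caresses"))     // "caress": classic Porter step 1a
println(PorterStemmer("running"))      // "run"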
*/ package cc.factorie.app.topics.lda -import java.io.{BufferedReader, File, FileInputStream, InputStreamReader, PrintWriter, Reader, StringReader} +import java.io._ +import cc.factorie.app.nlp.lexicon.{StopWords, TriePhraseLexicon} import cc.factorie.variable._ import scala.collection.mutable.ArrayBuffer @@ -102,7 +103,7 @@ class Document(val domain:CategoricalSeqDomain[String], var name:String, tokens: } object Document { - import cc.factorie.app.nlp.lexicon.{StopWords, TriePhraseLexicon} + import cc.factorie.app.strings.{StringSegmenter, alphaSegmenter} import scala.util.control.Breaks._ diff --git a/src/main/scala/cc/factorie/util/ModelProvider.scala b/src/main/scala/cc/factorie/util/ModelProvider.scala index 28e130c..23dacee 100644 --- a/src/main/scala/cc/factorie/util/ModelProvider.scala +++ b/src/main/scala/cc/factorie/util/ModelProvider.scala @@ -14,16 +14,16 @@ package cc.factorie.util import java.io._ import java.net.URL -import java.nio.file.{Paths, Path} +import java.nio.file.{Path, Paths} import cc.factorie.app.nlp.lexicon.LexiconsProvider import scala.language.implicitConversions -import scala.reflect.{ClassTag, classTag} import scala.reflect.runtime.universe.{TypeTag, typeTag} +import scala.reflect.{ClassTag, classTag} /** - * [[ModelProvider]] is a generic trait that serves to provide trained models (and/or lexicons) to factorie classes + * [[ModelProvider]] is a generic trait that serves to provide trained models (and/or lexicon) to factorie classes * without access to the classpath or system properties of the JVM within which factorie is running. This should replace * the existing idiom of using [[cc.factorie.util.ClasspathURL]] to resolve models. [[ModelProvider.classpath]] provides * the functionality previously supplied by [[cc.factorie.util.ClasspathURL]]. ModelProvider provides resources as diff --git a/src/main/scala/cc/factorie/util/namejuggler/PersonNameFormat.scala b/src/main/scala/cc/factorie/util/namejuggler/PersonNameFormat.scala index edbbbad..d2d200d 100644 --- a/src/main/scala/cc/factorie/util/namejuggler/PersonNameFormat.scala +++ b/src/main/scala/cc/factorie/util/namejuggler/PersonNameFormat.scala @@ -13,9 +13,9 @@ package cc.factorie.util.namejuggler -import cc.factorie.util.ModelProvider import cc.factorie.app.nlp.lexicon.TrieUnionLexicon import cc.factorie.app.nlp.lexicon.iesl.{PersonFirstHigh, PersonFirstHighest} +import cc.factorie.util.ModelProvider import cc.factorie.util.namejuggler.StringUtils._ object PersonNameFormat { diff --git a/src/main/scala/io/nlytx/factorie/nlp/api/DocumentAnnotator.scala b/src/main/scala/io/nlytx/factorie/nlp/api/DocumentAnnotator.scala index d417ebf..0a291a8 100644 --- a/src/main/scala/io/nlytx/factorie/nlp/api/DocumentAnnotator.scala +++ b/src/main/scala/io/nlytx/factorie/nlp/api/DocumentAnnotator.scala @@ -1,14 +1,51 @@ package io.nlytx.factorie.nlp.api -import cc.factorie.app.nlp.{DocumentAnnotatorPipeline, parse, pos, ner} +import cc.factorie.app.nlp.lexicon.{LexiconsProvider, StaticLexicons} +import cc.factorie.app.nlp.ner.StaticLexiconFeatures +import cc.factorie.app.nlp.parse.OntonotesTransitionBasedParser +import cc.factorie.app.nlp.pos.OntonotesForwardPosTagger +import cc.factorie.app.nlp.{DocumentAnnotatorPipeline, coref, ner} +import cc.factorie.util.{ClasspathURL, ModelProvider} + /** * Created by andrew@andrewresearch.net on 24/10/17. 
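The ModelProvider scaladoc above describes replacing the ClasspathURL idiom; this sketch resolves a serialized model through the classpath provider, mirroring the wiring in the api DocumentAnnotator further below, and assumes the .factorie model file is actually on the classpath.

import cc.factorie.app.nlp.ner.ConllChainNer
import cc.factorie.util.ModelProvider

// Resolve the serialized NER model from the classpath instead of a system-property lookup.
val nerProvider = ModelProvider.classpath[ConllChainNer]()
// The provider (plus lexicon features) is then passed to the annotator's constructor,
// as the api DocumentAnnotator below does with new ConllChainNer()(nerMp, slf).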
*/ object DocumentAnnotator { - val default = DocumentAnnotatorPipeline(pos.OntonotesForwardPosTagger, parse.WSJTransitionBasedParser) + private val slf = new StaticLexiconFeatures(new StaticLexicons()(LexiconsProvider.classpath()), "en") + + private val nerMp = ModelProvider.classpath[ner.ConllChainNer]() + private val nerTagger = new ner.ConllChainNer()(nerMp,slf) + System.setProperty( + classOf[ner.ConllChainNer].getName, + ClasspathURL[ner.ConllChainNer](".factorie").getPath + ) + + //private val pMp = ModelProvider.classpath[OntonotesPhraseEntityTypeLabeler]() + // private val phraseLabeler = new OntonotesPhraseEntityTypeLabeler() //(pMp) +// System.setProperty( +// classOf[OntonotesPhraseEntityTypeLabeler].getName, +// ClasspathURL[OntonotesPhraseEntityTypeLabeler](".factorie").getPath +// ) + + private val forCoref = coref.ForwardCoref + + private val posTagger = OntonotesForwardPosTagger + private val parser = OntonotesTransitionBasedParser + + + val pipeline = DocumentAnnotatorPipeline(posTagger,parser,nerTagger,forCoref) + + + + + + //BilouOntonotesNerChunkAnnotator, + //NerForwardCoref, + //ConllPatternBasedRelationFinder //lemma.WordNetLemmatizer //coref.NerStructuredCoref //phrase.PosBasedNounPhraseFinder + } diff --git a/src/main/scala/io/nlytx/factorie/nlp/api/DocumentBuilder.scala b/src/main/scala/io/nlytx/factorie/nlp/api/DocumentBuilder.scala index b849de1..8fc01ed 100644 --- a/src/main/scala/io/nlytx/factorie/nlp/api/DocumentBuilder.scala +++ b/src/main/scala/io/nlytx/factorie/nlp/api/DocumentBuilder.scala @@ -1,7 +1,7 @@ package io.nlytx.factorie.nlp.api import cc.factorie.app.nlp.Document -import cc.factorie.app.nlp.coref.MentionList + /** * Created by andrew@andrewresearch.net on 24/10/17. @@ -9,25 +9,51 @@ import cc.factorie.app.nlp.coref.MentionList class DocumentBuilder { - lazy val annotator = DocumentAnnotator.default + lazy val pipeline = DocumentAnnotator.pipeline def createAnnotatedDoc(text:String):Document = { + pipeline.profile = true val doc = new Document(text) - annotator.process(doc) + pipeline.process(doc) + println(pipeline.profileReport) doc } - def getDetails(doc:Document):String = { - - val owplString:String = doc.owplString(annotator.annotators.map(p => p.tokenAnnotationString(_))) - val mentions:String = doc.attr[MentionList].map { m => - val phrase = m.phrase - val mentionAnnotations = annotator.annotators.map(a => a.mentionAnnotationString(m)).mkString(", ") - phrase + ">> " + mentionAnnotations - }.mkString(", ") - val docAnnotations:String = annotator.annotators.map(a => a.documentAnnotationString(doc)).mkString(", ") - - s"owpl: $owplString\n mentions: $mentions\n docAnnotations: $docAnnotations\n" - } - -} \ No newline at end of file +// def url[T: TypeTag] = { +// //val prefix = "models/" +// val suffix = ".model" +// val name = typeOf[T].typeSymbol.fullName +// val path = name + suffix +// println(s"Path: $path") +// this.getClass.getClassLoader.getResource(path) +// } + + +// def getDetails(doc:Document):String = { +// +// val owplString:String = doc.owplString(pipeline.annotators.map(p => p.tokenAnnotationString(_))) +// val mentions:String = doc.attr[MentionList].map { m => +// val phrase = m.phrase +// val mentionAnnotations = pipeline.annotators.map(a => a.mentionAnnotationString(m)).mkString(", ") +// phrase + ">> " + mentionAnnotations +// }.mkString(", ") +// val docAnnotations:String = pipeline.annotators.map(a => a.documentAnnotationString(doc)).mkString(", ") +// +// s"owpl: $owplString\n mentions: $mentions\n docAnnotations: 
$docAnnotations\n" +// } + +} + +// Example usages: +// token.sentence.attr[ParseTree].parent(token) +// sentence.attr[ParseTree].children(token) +// sentence.attr[ParseTree].setParent(token, parentToken) +// sentence.attr[ParseTree].label(token) +// sentence.attr[ParseTree].label(token).set("SUBJ") + +// Methods also created in Token supporting: +// token.parseParent +// token.setParseParent(parentToken) +// token.parseChildren +// token.parseLabel +// token.leftChildren \ No newline at end of file diff --git a/src/test/resources/conll-ner-input b/src/test/resources/conll-ner-input deleted file mode 100644 index d1e27e8..0000000 --- a/src/test/resources/conll-ner-input +++ /dev/null @@ -1,65 +0,0 @@ --DOCSTART- -X- -X- O - -EU NNP I-NP I-ORG -rejects VBZ I-VP O -German JJ I-NP I-MISC -call NN I-NP O -to TO I-VP O -boycott VB I-VP O -British JJ I-NP I-MISC -lamb NN I-NP O -. . O O - -Peter NNP I-NP I-PER -Blackburn NNP I-NP I-PER - -BRUSSELS NNP I-NP I-LOC -1996-08-22 CD I-NP O - -The DT I-NP O -European NNP I-NP I-ORG -Commission NNP I-NP I-ORG -said VBD I-VP O -on IN I-PP O -Thursday NNP I-NP O -it PRP B-NP O -disagreed VBD I-VP O -with IN I-PP O -German JJ I-NP I-MISC -advice NN I-NP O -to TO I-PP O -consumers NNS I-NP O -to TO I-VP O -shun VB I-VP O -British JJ I-NP I-MISC -lamb NN I-NP O -until IN I-SBAR O -scientists NNS I-NP O -determine VBP I-VP O -whether IN I-SBAR O -mad JJ I-NP O -cow NN I-NP O -disease NN I-NP O -can MD I-VP O -be VB I-VP O -transmitted VBN I-VP O -to TO I-PP O -sheep NN I-NP O -. . O O - --DOCSTART- -X- -X- O - -CRICKET NNP I-NP O -- : O O -LEICESTERSHIRE NNP I-NP I-ORG -TAKE NNP I-NP O -OVER IN I-PP O -AT NNP I-NP O -TOP NNP I-NP O -AFTER NNP I-NP O -INNINGS NNP I-NP O -VICTORY NN I-NP O -. . O O - -LONDON NNP I-NP I-LOC -1996-08-30 CD I-NP O diff --git a/src/test/resources/parser-test-input b/src/test/resources/parser-test-input deleted file mode 100644 index 98bc4f9..0000000 --- a/src/test/resources/parser-test-input +++ /dev/null @@ -1,38 +0,0 @@ -1 Right right right RB RB _ _ 2 2 advmod advmod _ _ _ -2 now now now RB RB sem=TMP _ 5 5 advmod advmod _ 5:AM-TMP _ -3 they they they PRP PRP _ _ 5 5 nsubj nsubj _ 5:A0 _ -4 're be be VBP VBP _ _ 5 5 aux aux _ _ _ -5 pursuing pursue pursue VBG VBG pb=pursue.01|vn=51.6|ws=pursue.1 _ 0 0 root root _ _ _ -6 evidence evidence evidence NN NN ws=evidence.3 _ 5 5 dobj dobj _ 5:A1 _ -7 . . . . . 
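A sketch of calling the new high-level api once the models are on the classpath; createAnnotatedDoc and the printed accessors are the ones shown in this patch, and the sample sentence is illustrative.

import cc.factorie.app.nlp.Document
import io.nlytx.factorie.nlp.api.DocumentBuilder

val builder = new DocumentBuilder
val doc: Document = builder.createAnnotatedDoc("The European Commission said on Thursday it disagreed with German advice.")
println(s"tokens: ${doc.tokenCount}")
for (sentence <- doc.sentences)
  println(sentence.tokens.map(_.string).mkString(" "))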
_ _ 5 5 punct punct _ _ _ - -1 Out out out IN RP sem=LOC p2=IN 16 16 prep advmod _ 16:AM-LOC _ -2 in in in IN IN _ _ 1 1 prep prep _ _ _ -3 the the the DT DT _ _ 5 5 det det _ _ _ -4 oil oil oil NN NN ws=oil.1 _ 5 5 nn nn _ _ _ -5 fields field field NNS NNS ws=field.1 _ 2 2 pobj pobj _ _ _ -6 , , , , , _ _ 16 16 punct punct _ _ _ -7 if if if IN IN _ _ 9 9 mark mark _ _ _ -8 activity activity activity NN NN ws=activity.1 _ 9 9 nsubj nsubj _ 9:A1 _ -9 picks pick pick VBZ VBZ pb=pick.05|syn=ADV|ws=pick.7.4 _ 16 16 advcl advcl _ 16:AM-ADV _ -10 up up up RB RP syn=CLR p2=RB 9 9 advmod prt _ 9:C-V _ -11 much much much RB RB _ _ 12 12 advmod advmod _ _ _ -12 more more more RBR JJR _ p2=RBR 9 9 advmod advmod _ 9:A2 _ -13 , , , , , _ _ 16 16 punct punct _ _ _ -14 shortages shortage shortage NNS NNS _ _ 16 16 nsubj nsubj _ 16:A1 _ -15 could could could MD MD _ _ 16 16 aux aux _ 16:AM-MOD _ -16 appear appear appear VB VB pb=appear.01|vn=48.1.1|ws=appear.2 _ 0 0 root root _ _ _ -17 because because because IN IN _ _ 25 25 mark mark _ _ _ -18 so so so RB RB _ _ 19 19 advmod advmod _ _ _ -19 many many many JJ JJ _ _ 20 20 amod amod _ _ _ -20 roughnecks roughneck roughneck NNS NNS _ p2=RB 25 25 nsubj nsubj _ 25:A0 _ -21 , , , , , _ _ 20 20 punct punct _ _ _ -22 roustabouts roustabout roustabout NNS NNS _ _ 20 20 conj conj _ _ _ -23 and and and CC CC _ _ 22 22 cc cc _ _ _ -24 others other other NNS NNS _ _ 22 22 conj conj _ _ _ -25 left leave leave VBD VBD pb=leave.01|sem=PRP|vn=51.2-1|ws=leave.1 p2=VBN 16 16 advcl advcl _ 16:AM-CAU _ -26 after after after IN IN sem=TMP p2=RB 25 25 prep prep _ 25:AM-TMP _ -27 the the the DT DT _ _ 28 28 det det _ _ _ -28 crash crash crash NN NN ws=crash.3 _ 26 26 pobj pobj _ _ _ -29 . . . . . _ _ 16 16 punct punct _ _ _ - diff --git a/src/test/scala/cc/factorie/TestExamples.scala b/src/test/scala/cc/factorie/TestExamples.scala deleted file mode 100644 index f6b24f3..0000000 --- a/src/test/scala/cc/factorie/TestExamples.scala +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie -import java.io._ - -import org.junit.Test - -/** - * User: apassos - * Date: 10/10/12 - */ -class TestExamples { - - val emptyArgs = Array[String]() - - // Returns the name of a new temporary file with the specified - def dummyFileWithContents(prefix: String, content: String): String = { - val name = java.io.File.createTempFile("FactorieTestFile", prefix).getAbsolutePath - val writer = new BufferedWriter(new FileWriter(name)) - writer.write(content) - writer.close() - name - } - - def dummyDirectoryWithFileWithContents(prefix: String, content: String, ext: String = ".txt"): String = { - val dir = java.io.File.createTempFile("FactorieTestFile", prefix) - new File(dir.getAbsolutePath + "2").mkdirs() - val n1 = dir.getAbsolutePath + "2" + java.io.File.separator + "f1" + ext - val writer = new BufferedWriter(new FileWriter(n1)) - writer.write(content) - writer.close() - dir.getAbsolutePath+"2" - } - - val dummyNERFile = dummyFileWithContents("train", "A NN C I-PER\nA NNS D O\nA NNP C I-LOC") - - @Test def testChainNER1ML() { - cc.factorie.tutorial.ChainNERExample.main(Array(dummyNERFile, dummyNERFile)) - } - - @Test def testDirichletDemo() { - cc.factorie.tutorial.DirichletDemo.main(emptyArgs) - } - - val dummyDir1 = dummyDirectoryWithFileWithContents("documentDir1", "I am a file\n") - val dummyDir2 = dummyDirectoryWithFileWithContents("documentDir2", "I am a other file\n") - - @Test def testDocumentClassifier1() { - cc.factorie.tutorial.DocumentClassifier1.main(Array(dummyDir1, dummyDir2)) - } - - val posFile = dummyFileWithContents("POS", "\nHello NN\nWorld NN\n") - - @Test def testForwardBackwardPOS() { - cc.factorie.tutorial.ForwardBackwardPOS.main(Array("--train", posFile, "--dev", posFile, "--test", posFile)) - } - - @Test def testGaussianDemo() { - cc.factorie.tutorial.GaussianDemo.main(emptyArgs) - } - - @Test def testGaussianMixtureDemo() { - cc.factorie.tutorial.GaussianMixtureDemo.main(emptyArgs) - } - - @Test def testMultivariateGaussianDemo() { - cc.factorie.tutorial.MultivariateGaussianDemo.main(emptyArgs) - } - - @Test def testMultivariateGaussianMixtureDemo() { - cc.factorie.tutorial.MultivariateGaussianMixtureDemo.main(emptyArgs) - } - - @Test def testGrid() { - cc.factorie.tutorial.Grid.main(emptyArgs) - } - - @Test def testSimpleLDA() { - cc.factorie.tutorial.SimpleLDA.main(Array(dummyDir1)) - } - - @Test def testEfficientLDA() { - cc.factorie.tutorial.EfficientLDA.main(Array(dummyDir1)) - } - - @Test def testTopicsOverTime() { - cc.factorie.tutorial.TopicsOverTime.main(Array(dummyDir1, dummyDir2)) - } - - @Test def testMultinomialDemo() { - cc.factorie.tutorial.MultinomialDemo.main(emptyArgs) - } - - @Test def testTutorialVariables() { - cc.factorie.tutorial.TutorialVariables.main(emptyArgs) - } - - @Test def testTutorialDomain() { - cc.factorie.tutorial.TutorialDomain.main(emptyArgs) - } - - @Test def testTutorialFactors() { - cc.factorie.tutorial.TutorialFactors.main(emptyArgs) - } - - @Test def testTutorialFamily() { - cc.factorie.tutorial.TutorialFamily.main(emptyArgs) - } - - @Test def testTutorialModel() { - cc.factorie.tutorial.TutorialModel.main(emptyArgs) - } - - @Test def testTutorialLearning() { - cc.factorie.tutorial.TutorialLearning.main(emptyArgs) - } - - @Test def testTutorialParallelismAndHyperparameters() { - cc.factorie.tutorial.TutorialParallelismAndHyperparameters.main(emptyArgs) - } - - @Test def testWordSegmenter() { - cc.factorie.tutorial.WordSegmenter.main(emptyArgs) - } -} diff --git 
a/src/test/scala/cc/factorie/TestSerialize.scala b/src/test/scala/cc/factorie/TestSerialize.scala deleted file mode 100644 index c067ba5..0000000 --- a/src/test/scala/cc/factorie/TestSerialize.scala +++ /dev/null @@ -1,359 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -/* Copyright (C) 2008-2014 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie - -import java.io._ - -import cc.factorie.app.chain.ChainModel -import cc.factorie.app.nlp -import cc.factorie.app.nlp.ner.NerTag -import cc.factorie.la._ -import cc.factorie.model._ -import cc.factorie.util.BinarySerializer -import cc.factorie.variable._ -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -import scala.collection.mutable.ArrayBuffer -import scala.language.postfixOps - -class TestSerialize extends JUnitSuite with cc.factorie.util.FastLogging{ - - class MyChainNerFeatures(val token: nlp.Token, override val domain: CategoricalVectorDomain[String]) - extends BinaryFeatureVectorVariable[String] { - override def skipNonCategories = true - } - - class OntoNerLabel(token: nlp.Token, ta: String, val domain: CategoricalDomain[String]) extends NerTag(token, ta) { - type ContainedVariableType = this.type - } - - @Test def testTensorSerialization(): Unit = { - val random = new scala.util.Random(0) - val tensorFile = java.io.File.createTempFile("FactorieTestFile", "serialize-tensor").getAbsolutePath - val tensor = new SparseIndexedTensor2(100, 20) - for (i <- 0 until tensor.length) tensor(i) = random.nextDouble() - BinarySerializer.serialize(tensor, tensorFile) - val newTensor = BinarySerializer.deserialize[SparseIndexedTensor2](tensorFile) - assert(tensor.toSeq.sameElements(newTensor.toSeq)) - - val tensors = Seq(new DenseTensor3(100,1,4) , new SparseIndexedTensor2(100, 20)) - for (t <- tensors; i <- 0 until t.length) t(i) = random.nextDouble() - BinarySerializer.serialize(tensors, tensorFile) - val newTensors = BinarySerializer.deserialize[Seq[Tensor]](tensorFile) - assert(tensors.zip(newTensors).forall({case (t1, t2) => t1.toSeq.sameElements(t2.toSeq)})) - } - - @Test def testOutOfOrderDomainSerialization(): Unit = { - val random = new scala.util.Random(0) - val file = 
java.io.File.createTempFile("foo", "multi") - object MyChainNerFeaturesDomain extends CategoricalVectorDomain[String] - MyChainNerFeaturesDomain.dimensionDomain ++= Seq("A","B","C") - - object OntoNerLabelDomain extends CategoricalDomain[String] - OntoNerLabelDomain ++= Seq("Hello","GoodBye") - - val model = makeModel(MyChainNerFeaturesDomain, OntoNerLabelDomain) - model.bias.weights.value := Array.fill[Double](model.bias.weights.value.length)(random.nextDouble()) - model.obs.weights.value := Array.fill[Double](model.obs.weights.value.length)(random.nextDouble()) - model.markov.weights.value := Array.fill[Double](model.markov.weights.value.length)(random.nextDouble()) - - BinarySerializer.serialize(model, MyChainNerFeaturesDomain, OntoNerLabelDomain, file) - - object featDomain2 extends CategoricalVectorDomain[String] - object labelDomain2 extends CategoricalDomain[String] - val model2 = makeModel(featDomain2, labelDomain2) - - BinarySerializer.deserialize(model2, featDomain2, labelDomain2, file) - - assertSameWeights(model2, model) - } - - - @Test def testChainModelSerialization(): Unit = { - val random = new scala.util.Random(0) - - val f = File.createTempFile("FactorieTestFile", "serialize-chain-model") - val modelFileOutput = new FileOutputStream(f) - - logger.debug("creating toy model with random weights") - - object MyChainNerFeaturesDomain extends CategoricalVectorDomain[String] - MyChainNerFeaturesDomain.dimensionDomain ++= Seq("A","B","C") - - object OntoNerLabelDomain extends CategoricalDomain[String] - OntoNerLabelDomain ++= Seq("Hello","GoodBye") - - val model = makeModel(MyChainNerFeaturesDomain, OntoNerLabelDomain) - model.bias.weights.value:= Array.fill[Double](model.bias.weights.value.length)(random.nextDouble()) - model.obs.weights.value:= Array.fill[Double](model.obs.weights.value.length)(random.nextDouble()) - model.markov.weights.value:= Array.fill[Double](model.markov.weights.value.length)(random.nextDouble()) - logger.debug("serializing chain model") - model.serialize(modelFileOutput) - modelFileOutput.flush() - modelFileOutput.close() - - - val modelFileInput = new FileInputStream(f) - - val deserialized = deserializeChainModel(modelFileInput) - - assertSameWeights(model, deserialized) - - logger.debug("successfully deserialized") - } - - def getWeights(model: Parameters): Seq[Tensor] = model.parameters.tensors.toSeq - - def assertSameWeights(model1: Parameters, model2: Parameters): Unit = { - val weights1 = getWeights(model1) - val weights2 = getWeights(model2) - assert(weights1.size == weights2.size, - "Number of families didn't match: model1 had %d, model2 had %d" format (weights1.size, weights2.size)) - for ((w1, w2) <- weights1.zip(weights2)) { - logger.debug("# active elements in w1: " + w1.activeDomainSize) - logger.debug("# active elements in w2: " + w2.activeDomainSize) - assert(w1.activeDomainSize == w2.activeDomainSize) - for (((a1, a2), (b1, b2)) <- w1.activeElements.toSeq.zip(w2.activeElements.toSeq)) { - assert(a1 == b1, "Index %d from w1 not equal to %d from w2" format (a1, b1)) - assert(a2 == b2, "Value %f at index %d from w1 not equal to value %f at index %d from w2" format (a2, a1, b2, b1)) - } - } - } - - def makeModel(featuresDomain: CategoricalVectorDomain[String], - labelDomain: CategoricalDomain[String]): ChainModel[OntoNerLabel, MyChainNerFeatures, nlp.Token] = { - object model extends ChainModel[OntoNerLabel, MyChainNerFeatures, nlp.Token]( - labelDomain, featuresDomain, l => l.token.attr[MyChainNerFeatures], l => l.token, t => 
t.attr[OntoNerLabel]) - model.useObsMarkov = false - model - } - - def deserializeChainModel(iStream: InputStream): ChainModel[OntoNerLabel, MyChainNerFeatures, nlp.Token] = { - object MyChainNerFeaturesDomain extends CategoricalVectorDomain[String] - object OntoNerLabelDomain extends CategoricalDomain[String] - val model = makeModel(MyChainNerFeaturesDomain, OntoNerLabelDomain) - model.deserialize(iStream) - model - } - - @Test def testModelSerializationWithDomains(): Unit = { - object domain1 extends CategoricalDomain[String] - val words = "The quick brown fox jumped over the lazy dog".split(" ") - words.foreach(domain1.index(_)) - - class Model1(d: CategoricalDomain[String]) extends Model with Parameters { - val family1 = new DotFamilyWithStatistics1[CategoricalVariable[String]] { - val weights = Weights(new DenseTensor1(d.length)) - } - def families: Seq[DotFamily] = Seq(family1) - def factors(v: Iterable[Var]) = Nil - } - val model = new Model1(domain1) - model.family1.weights.value(6) = 12 - - val fileName1 = java.io.File.createTempFile("foo", "domain") - val domainFile = new File(fileName1.getAbsolutePath) - val domainCubbie = new CategoricalDomainCubbie(domain1) - BinarySerializer.serialize(domainCubbie, domainFile) - - val fileName2 = java.io.File.createTempFile("foo", "model") - val modelFile = new File(fileName2.getAbsolutePath) - val modelCubbie = new WeightsSetCubbie(model.parameters) - BinarySerializer.serialize(modelCubbie, modelFile) - - object domain2 extends CategoricalDomain[String] - val model2 = new Model1(domain2) - - val domainFile2 = new File(fileName1.getAbsolutePath) - val domainCubbie2 = new CategoricalDomainCubbie(domain2) - BinarySerializer.deserialize(domainCubbie2, domainFile2) - - val modelFile2 = new File(fileName2.getAbsolutePath) - val modelCubbie2 = new WeightsSetCubbie(model2.parameters) - BinarySerializer.deserialize(modelCubbie2, modelFile2) - - assertSameWeights(model, model2) - } - - @Test def testMultipleSerialization(): Unit = { - val random = new scala.util.Random(0) - val file = java.io.File.createTempFile("foo", "multi") - object MyChainNerFeaturesDomain extends CategoricalVectorDomain[String] - MyChainNerFeaturesDomain.dimensionDomain ++= Seq("A","B","C") - - object OntoNerLabelDomain extends CategoricalDomain[String] - OntoNerLabelDomain ++= Seq("Hello","GoodBye") - - val model = makeModel(MyChainNerFeaturesDomain, OntoNerLabelDomain) - model.bias.weights.value := Array.fill[Double](model.bias.weights.value.length)(random.nextDouble()) - model.obs.weights.value := Array.fill[Double](model.obs.weights.value.length)(random.nextDouble()) - model.markov.weights.value := Array.fill[Double](model.markov.weights.value.length)(random.nextDouble()) - - BinarySerializer.serialize(MyChainNerFeaturesDomain, OntoNerLabelDomain, model, file) - - object featDomain2 extends CategoricalVectorDomain[String] - object labelDomain2 extends CategoricalDomain[String] - val model2 = makeModel(featDomain2, labelDomain2) - - BinarySerializer.deserialize(featDomain2, labelDomain2, model2, file) - - assertSameWeights(model2, model) - } - - @Test def testClassifierPosSerialization() { - val model = new app.nlp.pos.ForwardPosTagger - val fileName = java.io.File.createTempFile("FactorieTestFile", "classifier-pos").getAbsolutePath - model.serialize(fileName) - val otherModel = new app.nlp.pos.ForwardPosTagger(new File(fileName)) - } - - @Test def testInstanceSerialize(): Unit = { - implicit val random = new scala.util.Random(0) - import app.classify._ - val fileName = 
java.io.File.createTempFile("FactorieTestFile", "serialize-instances").getAbsolutePath - val ll = new ArrayBuffer[app.classify.Label]() - val labelDomain = new CategoricalDomain[String] { } - val featuresDomain = new CategoricalVectorDomain[String] { } - for (i <- 1 to 100) { - val labelName = (i % 2).toString - val features = new BinaryFeatures(labelName, i.toString, featuresDomain, labelDomain) - (1 to 100).shuffle.take(50).map(_.toString).foreach(features +=) - ll += new app.classify.Label(labelName, features, labelDomain) - } - val llFile = new File(fileName) - val llCubbie = new LabelListCubbie(featuresDomain, labelDomain, true) - llCubbie.store(ll) - BinarySerializer.serialize(llCubbie, llFile) - - val newllCubbie = new LabelListCubbie(featuresDomain, labelDomain, true) - BinarySerializer.deserialize(newllCubbie, llFile) - val newll = newllCubbie.fetch() - - assert(newll.zip(ll).forall({ - case (newl, oldl) => - newl.labelName == oldl.labelName && - newl.features.value.activeElements.sameElements(oldl.features.value.activeElements) - })) - } - - @Test def test(): Unit = { - val fileName = java.io.File.createTempFile("FactorieTestFile", "serialize-model").getAbsolutePath - val fileName2 = java.io.File.createTempFile("FactorieTestFile", "serialize-domain").getAbsolutePath - // Read data and create Variables - val sentences = for (string <- data.toList) yield { - val sentence = new Sentence - var beginword = true - for (c <- string.toLowerCase) { - if (c >= 'a' && c <= 'z') { - sentence += new Token(c, beginword) - beginword = false - } else - beginword = true - } - for (token <- sentence.links) { - if (token.hasPrev) token += (token.prev.char + "@-1") else token += "START@-1" - if (token.hasNext) token += (token.next.char + "@1") else token += "END@+1" - } - sentence - } - logger.debug("TokenDomain.dimensionDomain.size=" + TokenDomain.dimensionDomain.size) - - val model = new SegmenterModel - model.bias.weights.value += new UniformTensor1(model.bias.weights.value.dim1, 1.0) - model.obs.weights.value += new la.UniformTensor2(model.obs.weights.value.dim1, model.obs.weights.value.dim2, 1.0) - - val modelFile = new File(fileName) - - BinarySerializer.serialize(new WeightsSetCubbie(model.parameters), modelFile) - - val deserializedModel = new SegmenterModel - BinarySerializer.deserialize(new WeightsSetCubbie(deserializedModel.parameters), modelFile) - - val domainFile = new File(fileName2) - - BinarySerializer.serialize(new CategoricalVectorDomainCubbie(TokenDomain), domainFile) - - logger.debug("Original model family weightsSet: ") - getWeights(model).foreach(s => logger.debug(s.toString)) - logger.debug("Deserialized model family weightsSet: ") - getWeights(deserializedModel).foreach(s => logger.debug(s.toString)) - - assertSameWeights(model, deserializedModel) - - logger.debug("Original domain:") - logger.debug(TokenDomain.dimensionDomain.toSeq.mkString(",")) - logger.debug("Deserialized domain:") - val newDomain = new CategoricalVectorDomain[String] { } - val cubbie = new CategoricalVectorDomainCubbie(newDomain) - BinarySerializer.deserialize(cubbie, domainFile) - logger.debug(newDomain.dimensionDomain.toSeq.mkString(",")) - - assert(TokenDomain.dimensionDomain.toSeq.map(_.category).sameElements(newDomain.dimensionDomain.toSeq.map(_.category))) - } - - class Label(b: Boolean, val token: Token) extends LabeledBooleanVariable(b) - object TokenDomain extends CategoricalVectorDomain[String] - class Token(val char: Char, isWordStart: Boolean) extends BinaryFeatureVectorVariable[String] with 
ChainLink[Token, Sentence] { - def domain = TokenDomain - val label = new Label(isWordStart, this) - this += char.toString - if ("aeiou".contains(char)) this += "VOWEL" - } - class Sentence extends Chain[Sentence, Token] - - class SegmenterModel extends Model with Parameters { - val bias = new DotFamilyWithStatistics1[Label] { - factorName = "Label" - val weights = Weights(new la.DenseTensor1(BooleanDomain.size)) - } - val obs = new DotFamilyWithStatistics2[Label, Token] { - factorName = "Label,Token" - val weights = Weights(new la.DenseTensor2(BooleanDomain.size, TokenDomain.dimensionSize)) - } - def factors(label: Iterable[Var]): Iterable[Factor] = { - Seq.empty[Factor] - } - } - - val data = Array( - "Free software is a matter of the users' freedom to run, copy, distribute, study, change and improve the software. More precisely, it refers to four kinds of freedom, for the users of the software.", - "The freedom to run the program, for any purpose.", - "The freedom to study how the program works, and adapt it to your needs.", - "The freedom to redistribute copies so you can help your neighbor.", - "The freedom to improve the program, and release your improvements to the public, so that the whole community benefits.", - "A program is free software if users have all of these freedoms. Thus, you should be free to redistribute copies, either with or without modifications, either gratis or charging a fee for distribution, to anyone anywhere. Being free to do these things means (among other things) that you do not have to ask or pay for permission.", - "You should also have the freedom to make modifications and use them privately in your own work or play, without even mentioning that they exist. If you do publish your changes, you should not be required to notify anyone in particular, or in any particular way.", - "In order for the freedoms to make changes, and to publish improved versions, to be meaningful, you must have access to the source code of the program. Therefore, accessibility of source code is a necessary condition for free software.", - "Finally, note that criteria such as those stated in this free software definition require careful thought for their interpretation. To decide whether a specific software license qualifies as a free software license, we judge it based on these criteria to determine whether it fits their spirit as well as the precise words. If a license includes unconscionable restrictions, we reject it, even if we did not anticipate the issue in these criteria. Sometimes a license requirement raises an issue that calls for extensive thought, including discussions with a lawyer, before we can decide if the requirement is acceptable. When we reach a conclusion about a new issue, we often update these criteria to make it easier to see why certain licenses do or don't qualify.", - "In order for these freedoms to be real, they must be irrevocable as long as you do nothing wrong; if the developer of the software has the power to revoke the license, without your doing anything to give cause, the software is not free.", - "However, certain kinds of rules about the manner of distributing free software are acceptable, when they don't conflict with the central freedoms. For example, copyleft (very simply stated) is the rule that when redistributing the program, you cannot add restrictions to deny other people the central freedoms. 
This rule does not conflict with the central freedoms; rather it protects them.", - "Thus, you may have paid money to get copies of free software, or you may have obtained copies at no charge. But regardless of how you got your copies, you always have the freedom to copy and change the software, even to sell copies.", - "Rules about how to package a modified version are acceptable, if they don't effectively block your freedom to release modified versions. Rules that ``if you make the program available in this way, you must make it available in that way also'' can be acceptable too, on the same condition. (Note that such a rule still leaves you the choice of whether to publish the program or not.) It is also acceptable for the license to require that, if you have distributed a modified version and a previous developer asks for a copy of it, you must send one.", - "Sometimes government export control regulations and trade sanctions can constrain your freedom to distribute copies of programs internationally. Software developers do not have the power to eliminate or override these restrictions, but what they can and must do is refuse to impose them as conditions of use of the program. In this way, the restrictions will not affect activities and people outside the jurisdictions of these governments.", - "Finally, note that criteria such as those stated in this free software definition require careful thought for their interpretation. To decide whether a specific software license qualifies as a free software license, we judge it based on these criteria to determine whether it fits their spirit as well as the precise words. If a license includes unconscionable restrictions, we reject it, even if we did not anticipate the issue in these criteria. Sometimes a license requirement raises an issue that calls for extensive thought, including discussions with a lawyer, before we can decide if the requirement is acceptable. When we reach a conclusion about a new issue, we often update these criteria to make it easier to see why certain licenses do or don't qualify.", - "The GNU Project was launched in 1984 to develop a complete Unix-like operating system which is free software: the GNU system.") - -} diff --git a/src/test/scala/cc/factorie/TestUtils.scala b/src/test/scala/cc/factorie/TestUtils.scala deleted file mode 100644 index d1ca582..0000000 --- a/src/test/scala/cc/factorie/TestUtils.scala +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - - -package cc.factorie - -trait TestUtils { - import scala.collection.Seq - - /** - * Return all combinations of n elements from - * given sequence w/o replacement - */ - def chooseN[T](seq:Seq[T], n:Int):Seq[Seq[T]] = { - // Helpers: - // [(a, 1), (b, 2), ...] -> [a, b, ..] 
- def removeZipIndex[T](ss:Seq[(T, Int)]) = ss.unzip._1 - // [(a, 0), (b, 3), (c, 23)] -> 23 - def lastElemZipIndex[T](s:Seq[(T, Int)]) = s.last._2 - // [(a, 0), (b, 1)] -> [a, b, c, d] -> [c, d] - def remainingSeq[T](s:Seq[(T, Int)], seq:Seq[(T, Int)]) = seq.view(lastElemZipIndex(s)+1, seq.length) - - def choose[T](n:Int, seq:Seq[(T, Int)]):Seq[Seq[(T, Int)]] = n match { - case 0 => Nil - case 1 => seq.map(_ :: Nil) - case i:Int => choose(i-1, seq) flatMap { - ll => remainingSeq(ll, seq).map { e => ll :+ e }} - } - choose(n, seq.zipWithIndex) map removeZipIndex _ - } -} diff --git a/src/test/scala/cc/factorie/app/bib/parser/TestBibtexParser.scala b/src/test/scala/cc/factorie/app/bib/parser/TestBibtexParser.scala deleted file mode 100644 index 0a57075..0000000 --- a/src/test/scala/cc/factorie/app/bib/parser/TestBibtexParser.scala +++ /dev/null @@ -1,518 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -package cc.factorie.app.bib.parser - -import java.io.File - -import cc.factorie.app.bib.parser.Dom.Name -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -import scala.collection.mutable.ArrayBuffer - -class TestBibtexParser extends JUnitSuite with cc.factorie.util.FastLogging { - - def testMichaelsStuff(): Unit = { - - val path = """C:\Users\Luke\Downloads\failed\failed\failed""" - - val fileTexts = new File(path).listFiles().toList - .filter(_.isFile) - .map(f => (f.getName, scala.io.Source.fromFile(f.getPath, "ISO-8859-1").toArray.mkString)) - - val results = fileTexts map { - case (name, fileText) => - Dom.stringToDom(fileText).fold(err => - Left(""" -Error on file: "%s" -Error text: "%s" """ format (name, err)), - _ => - Right(""" -Success on file: "%s" """ format name)) - } - - val (failures, successes) = (new ArrayBuffer[String], new ArrayBuffer[String]) - results.foreach(_.fold(failures.+=, successes.+=)) - - val failuresCauseNotBibtex = failures.filter(_.contains("`@' expected")) - val failuresCauseMismatchedQuote = failures.filter(_.contains("`\"' expected but \u001A found")) - val failuresCauseBadNames = failures.filter(_.contains("fragment between commas")) - - failuresCauseNotBibtex.foreach(failures.-=) - failuresCauseMismatchedQuote.foreach(failures.-=) - failuresCauseBadNames.foreach(failures.-=) - - successes.foreach(logger.debug(_)) - failures.foreach(logger.debug(_)) - - logger.debug("Failures cause bad names:") - failuresCauseBadNames.foreach(logger.debug(_)) - - if (!failures.isEmpty) - sys.error( - "Failed! 
Successes: %d Failures %d FailuresCauseNotBibtex: %d FailuresCauseMismatchedQuote: %d FailuresCauseBadNames: %d" format - (successes.length, failures.length, failuresCauseNotBibtex.length, - failuresCauseMismatchedQuote.length, failuresCauseBadNames.length)) - } - - @Test def allTests(): Unit = { - - def assertParse[T](parser: DocumentParser.Impl.Parser[T], str: String): DocumentParser.Impl.ParseResult[T] = { - val result = DocumentParser.Impl.parseAll(parser, str) - assert(result.successful, result.toString + " " + result.getClass.getName) - result - } - - def assertParseAndDocify(parser: DocumentParser.Impl.Parser[List[AST.Entry]], str: String, print: Boolean = false): Unit = { - val parseResult = assertParse(parser, str) - assert(parseResult.successful, parseResult) - val res = Dom.astToDom(AST.Document(parseResult.get)) - if (print) logger.debug(res) - } - - assertParse(DocumentParser.Impl.braceDelimitedNoOuterLiteral, "{Something Great}") - assertParse(DocumentParser.Impl.literal, "{Something Great}") - assertParse(DocumentParser.Impl.literalOrSymbol, "{Something Great}") - assertParse(DocumentParser.Impl.value, "{Something Great}") - - assertParse(DocumentParser.Impl.quoteDelimitedLiteral, "\"Something Great\"") - assertParse(DocumentParser.Impl.literal, "\"Something Great\"") - assertParse(DocumentParser.Impl.literalOrSymbol, "\"Something Great\"") - assertParse(DocumentParser.Impl.value, "\"Something Great\"") - - assertParse(DocumentParser.Impl.numericLiteral, "123") - assertParse(DocumentParser.Impl.literal, "123") - assertParse(DocumentParser.Impl.literalOrSymbol, "123") - assertParse(DocumentParser.Impl.value, "123") - - assertParse(DocumentParser.Impl.SYMBOL, "asda5") - assertParse(DocumentParser.Impl.literalOrSymbol, "asda5") - assertParse(DocumentParser.Impl.value, "asda5") - - assertParse(DocumentParser.Impl.tag, "asda5 = { 132 as qwe asd }") - - assertParse(DocumentParser.Impl.value, "asda5 # asda5") - - assertParse(DocumentParser.Impl.commentEntry, "comment{wooooo!}") - - assertParse(DocumentParser.Impl.preambleEntry, "preamble{wooooo}") - - assertParse(DocumentParser.Impl.stringEntry, "string{wooooo = 1231}") - assertParse(DocumentParser.Impl.anyEntry, "@string{wooooo = 1231}") - assertParse(DocumentParser.Impl.anyEntry, "@string{ wooooo = {asd} }") - - assertParse(DocumentParser.Impl.anyEntry, "@string{ wooooo = {asd} }") - assertParse(DocumentParser.Impl.anyEntry, "@preamble{ wooooo}") - assertParse(DocumentParser.Impl.anyEntry, "@comment{ wooooo }") - - assertParse(DocumentParser.Impl.anyEntry, "@florb{ wooooo }") - assertParse(DocumentParser.Impl.anyEntry, "@florb{ wooooo, x = {y}, fg = sdf13, z = 123 }") - assertParse(DocumentParser.Impl.anyEntry, "@florb{ wooooo, x = {y}, fg = sdf13, z = 123, }") - assertParse(DocumentParser.Impl.anyEntry, "@florb{ wooooo, x = {y}, fg =\"sdf13\", z = 123, }") - assertParse(DocumentParser.Impl.anyEntry, - """@florb{ wooooo, - x = {y}, - fg ="sdf13", - z = 123 # asd, - }""") - - assertParse(DocumentParser.Impl.freeComment, "i am the king of the owrld!!") - assertParse(DocumentParser.Impl.freeComment, """i am the king of the - - owrld!!""") - - assertParse(DocumentParser.Impl.WS ~> DocumentParser.Impl.anyEntry, - """ @florb{ wooooo, - x = {y}, - fg ="sdf13", - z = 123 # asd, - }""") - - assertParse((DocumentParser.Impl.WS ~> DocumentParser.Impl.anyEntry).+, - """ @florb{ wooooo, - x = {y}, - fg ="sdf13", - z = 123 # asd, - }""") - - assertParse(DocumentParser.Impl.bibTex, - """ @florb{ wooooo, - x = {y}, - fg ="sdf13", - z = 123 # 
asd, - }""") - - assertParse(DocumentParser.Impl.bibTex, - """ @florb{ wooooo, - x = {y}, - fg ="sdf13", - z = 123 # asd, - } - - """ - ) - - assertParse(DocumentParser.Impl.bibTex, - """ - Hi, everybody! - - @florb{ wooooo, - x = {y}, - fg ="sdf13", - z = 123 # asd, - } - @florb{ wooooo, - x = {y}, - fg ="sdf13", - z = 123 # asd, - } - @florb{ wooooo, - x = {y}, - fg ="sdf13", - z = 123 # asd, - } - - free comments are coool - @florb{ wooooo, - x = {y}, - fg ="sdf13", - z = 123 # asd, - } - - - """) - - - assertParse(DocumentParser.Impl.bibTex, - """ - @article {mrx05, - auTHor = "Mr. X", - Title = {Something Great}, - publisher = "nob" # "ody", - YEAR = 2005 - } - """) - - assertParse( - DocumentParser.Impl.braceDelimitedNoOuterLiteral, - "{Interannual Variability of planet-encircling dust activity on {M}ars}") - - // this sample is from: http://amath.colorado.edu/documentation/LaTeX/reference/faq/bibstyles.html - val coloradoSample = assertParse(DocumentParser.Impl.bibTex, - """ - -@string{jgr = "J.~Geophys.~Res."} - -@MISC{primes, - author = "Charles Louis Xavier Joseph de la Vall{\'e}e Poussin", - note = "A strong form of the prime number theorem, 19th century", - year = 1879 - } - -@INBOOK{chicago, - title = "The Chicago Manual of Style", - publisher = "University of Chicago Press", - edition = "Thirteenth", - year = 1982, - pages = "400--401", - key = "Chicago" - } - -@BOOK{texbook, - author = "Donald E. Knuth", - title= "The {{\TeX}book}", - publisher = "Addison-Wesley", - year = 1984 - } - -@BOOK{latexbook, - author = "Leslie Lamport", - title = "{\LaTeX \rm:} {A} Document Preparation System", - publisher = "Addison-Wesley", - year = 1986 - } - -@UNPUBLISHED{btxdoc, - author = "Oren Patashnik", - title = "{Using BibTeX}", - note = "Documentation for general BibTeX users", - month = jan, - year = 1988 - } - -@UNPUBLISHED{btxhak, - author = "Oren Patashnik", - title = "Designing BibTeX Styles", - note = "The part of BibTeX's documentation - that's not meant for general users", - month = jan, - year = 1988 - } - -@BOOK{strunk, - author = "Strunk, Jr., William and E. B. White", - title = "The Elements of Style", - publisher = "Macmillan", - edition = "Third", - year = 1979 - } - -@book{vanleunen, - title = "A Handbook for Scholars", - author = "Mary-Claire van Leunen", - publisher = "Knopf", - year = "1979" - } - -@ARTICLE{Zurek:1993, - AUTHOR = {Zurek, R. W. and Martin, L. 
J.}, - TITLE = {Interannual Variability of planet-encircling dust activity on {M}ars}, - YEAR = {1993}, - JOURNAL = jgr, - VOLUME = {98}, - NUMBER = {E2}, - PAGES = {3247--3259} -} - -@Article{Narendra_1990, - author = {K.S.Narendra and K.S.Parthsarathy}, - title = {Identification and Control of Dynamical System - using Neural Networks}, - journal = "IEENN", - year = {1990}, - volume = {1}, - number = {1}, - month = {}, - pages = {4-27}, - note = {}, - annote = {} -} - - - """) - - assert(coloradoSample.successful, coloradoSample) - Dom.astToDom(AST.Document(coloradoSample.get)) - - assertParseAndDocify(DocumentParser.Impl.bibTex, """ - @InProceedings{dredze-EtAl:2007:EMNLP-CoNLL2007, - author = {Dredze, Mark and Blitzer, John and Pratim Talukdar, Partha and Ganchev, Kuzman and Graca, Jo\~ao and Pereira, Fernando}, - title = {Frustratingly Hard Domain Adaptation for Dependency Parsing}, - booktitle = {Proceedings of the CoNLL Shared Task Session of EMNLP-CoNLL 2007} - pages = {1051--1055}, - url = {http://www.aclweb.org/anthology/D/D07/D07-1112} - } - """) - - assertParseAndDocify(DocumentParser.Impl.bibTex, """ - @InProceedings{BanikACL09-shortpaper, - author = {Eva Banik}, - title = {Extending a Surface Realizer to Generate Coherent Discourse}, - booktitle = {Proceedings of the Short Papers of the Joint conference of the Association for Computational Linguistics and the Asian Federation of Natural Language Processing (ACL-IJCNLP-09), Singapore}, - year = 2009 - } - - @inproceedings{webdb03-smwea, - title={{ODISSEA: A Peer-to-Peer Architecture for Scalable Web Search and Information Retrieval}}, - author={T. Suel and C. Mathur and J. Wu and J. Zhang and A. Delis - and M. Kharrazi and X. Long and K. Shanmugasunderam}, - booktitle={{6th International Workshop on the Web and Databases (WebDB)}}, - month={June}, - year={2003}, - address={San Diego, CA} - } - - @inproceedings{1333582, - author = {Donglai Zhang and Paul Coddington and Andrew Wendelborn}, - title = {Binary Data Transfer Performance over High-Latency Networks Using Web Service Attachments}, - booktitle = {E-SCIENCE '07: Proceedings of the Third IEEE International Conference on e-Science and Grid Computing}, - year = {2007}, - isbn = {0-7695-3064-8}, - pages = {261--269}, - doi = {http://dx.doi.org/10.1109/E-SCIENCE.2007.16}, - publisher = {IEEE Computer Society} - - - } - """) - - assertParseAndDocify(DocumentParser.Impl.bibTex, """ -@inproceedings{nahm:icml-wkshp02, - author = {Un Yong Nahm and Mikhail Bilenko and Raymond J. - Mooney}, - title = {Two Approaches to Handling Noisy Variation in Text - Mining}, - booktitle = {Proceedings of the ICML-2002 Workshop on - Text Learning}, - pages = {18--27}, - year = 2002, -} - } - """) - - assertParseAndDocify(DocumentParser.Impl.bibTex, """ -@article{1814808, - author = {Kauppinen, Tomi and Mantegari, Glauco and Paakkarinen, Panu and Kuittinen, Heini and Hyv\"{o}nen, Eero and Bandini, Stefania}, - title = {Determining relevance of imprecise temporal intervals for cultural heritage information retrieval}, - journal = {Int. J. Hum.-Comput. 
Stud.}, - volume = {68}, - number = {9}, - year = {2010}, - issn = {1071-5819}, - pages = {549--560}, - doi = {http://dx.doi.org/10.1016/j.ijhcs.2010.03.002}, - publisher = {Academic Press, Inc.}, - address = {Duluth, MN, USA}, - }""") - - assertParseAndDocify(DocumentParser.Impl.bibTex, """ -@inproceedings{sen07:coordinating, - Author = {Sen, Rohan and Hackmann, Gregory and Haitjema, Mart and Roman, Gruia-Catalin and Gill, Christopher}, - Booktitle = {Lecture Notes in Computer Science}, - Pages = {249--267}, - Title = {Coordinating Workflow Allocation and Execution in Mobile Environments}, - Url = {http://dx.doi.org/10.1007/978-3-540-72794-1_14}, - Volume = {4467} - Year = {2007} -}""") - - assertParseAndDocify(DocumentParser.Impl.bibTex, """ - @COMMENT This file was generated by bib2html.pl version 0.94 - @COMMENT written by Patrick Riley - @COMMENT This file came from Evgeniy Gabrilovich's publication pages at - @COMMENT http://www.gabrilovich.com/pubs.html - @Proceedings{Bunescu:2008:WikiAI, - title = "Proceedings of the AAAI Workshop on Wikipedia and Artificial Intelligence: An Evolving Synergy (WikiAI)", - year = 2008, - editor = "Razvan Bunescu and Evgeniy Gabrilovich and Rada Mihalcea", - month = "July", - organization = "Association for the Advancement of Artificial Intelligence", - publisher = "{AAAI} Press", - note = "AAAI Technical Report WS-08-15" - } - """, print = true) - - assertParseAndDocify(DocumentParser.Impl.bibTex, """ - @article{acmalg295, - author="H. Sp\"{a}th", - title="Exponential Curve Fit", - volume=10, - number=2, - pages="87", - year=1967, - month="February", - journal=cacm - } - """, print = true) - - expectResult(NameParser.stringToNames("Graca, Jo\\~ao"))(List(Name("Jo\\~ao", "", "Graca", ""))) - - expectResult(NameParser.stringToNames("Ludwig von Beethoven"))(List(Name("Ludwig", "von", "Beethoven", ""))) - expectResult(NameParser.stringToNames("von Beethoven, Ludwig"))(List(Name("Ludwig", "von", "Beethoven", ""))) - expectResult(NameParser.stringToNames("Jones, Jr., John-Paul"))(List(Name("John Paul", "", "Jones", "Jr."))) - expectResult(NameParser.stringToNames("John Paul Jones"))(List(Name("John Paul", "", "Jones", ""))) - - expectResult(NameParser.stringToNames("John Paul Jones and Jones, John Paul"))( - List(Name("John Paul", "", "Jones", ""), Name("John Paul", "", "Jones", ""))) - expectResult(NameParser.stringToNames("John Paul Jones and Ludwig von Beethoven"))( - List(Name("John Paul", "", "Jones", ""), Name("Ludwig", "von", "Beethoven", ""))) - - expectResult(NameParser.stringToNames("Charles Louis Xavier Joseph de la Vallee Poussin"))( - List(Name("Charles Louis Xavier Joseph", "de la", "Vallee Poussin", ""))) - - expectResult(NameParser.stringToNames("{Barnes} {and} {Noble} {Inc.}"))(List(Name("Barnes", "and", "Noble Inc.", ""))) - - expectResult(NameParser.stringToNames("Ralph Alpher and Bethe, Hans and George Gamow"))( - List(Name("Ralph", "", "Alpher", ""), Name("Hans", "", "Bethe", ""), Name("George", "", "Gamow", ""))) - expectResult(NameParser.stringToNames("K.S.Narendra"))(List(Name("K. 
S.", "", "Narendra", ""))) - - expectResult(NameParser.stringToNames("{\\e'}cole"))(List(Name("", "", "{\\e'}cole", ""))) - - expectResult(NameParser.stringToNames("John-Paul Jones and Bill Thompson"))( - List(Name("John Paul", "", "Jones", ""), Name("Bill", "", "Thompson", ""))) - - expectResult(NameParser.stringToNames("{\\e'}col{\\e'}"))(List(Name("", "", "{\\e'}col{\\e'}", ""))) - - expectResult(NameParser.stringToNames("{hey ho lotsa stu\\}ff}"))(List(Name("", "", "hey ho lotsa stu\\}ff", ""))) - expectResult(NameParser.stringToNames("{Jean} {de la Fontaine du} {Bois Joli}"))(List(Name("Jean", "de la Fontaine du", "Bois Joli", ""))) - expectResult(NameParser.stringToNames("Jean de la Fontaine du Bois Joli"))(List(Name("Jean", "de la Fontaine du", "Bois Joli", ""))) - - val clx1 = NameParser.stringToNames("Charles Louis Xavier Joseph de la Vall{\\'e}e Poussin").head - expectResult(clx1)(Name("Charles Louis Xavier Joseph", "de la", "Vall{\\'e}e Poussin", "")) - val clx2 = Dom.stringToDom("@thing{asdf, author = \"Charles Louis Xavier Joseph de la Vall{\\'e}e Poussin\"}") - .right.get.entries.head._2.authors.get.head - expectResult(clx2)(Name("Charles Louis Xavier Joseph", "de la", "Vall{\\'e}e Poussin", "")) - val clx3 = Dom.stringToDom("@thing{asdf, author = {Charles Louis Xavier Joseph de la Vall{\\'e}e Poussin}}") - .right.get.entries.head._2.authors.get.head - expectResult(clx3)(Name("Charles Louis Xavier Joseph", "de la", "Vall{\\'e}e Poussin", "")) - - assert(clx1 == clx2 && clx2 == clx3, (clx1, clx2, clx3)) - - val ksn1 = NameParser.stringToNames("K.S.Narendra").head - expectResult(ksn1)(Name("K. S.", "", "Narendra", "")) - val ksn2 = Dom.stringToDom("@thing{asdf, author = \"K.S.Narendra\"}") - .right.get.entries.head._2.authors.get.head - expectResult(ksn2)(Name("K. S.", "", "Narendra", "")) - val ksn3 = Dom.stringToDom("@thing{asdf, author = {K.S.Narendra}}") - .right.get.entries.head._2.authors.get.head - expectResult(ksn3)(Name("K. S.", "", "Narendra", "")) - val ksn4 = Dom.stringToDom("@thing{asdf, author = {K.S.Narendra and Hugh Jass}}") - .right.get.entries.head._2.authors.get.head - expectResult(ksn4)(Name("K. 
S.", "", "Narendra", "")) - - assert(ksn1 == ksn2 && ksn2 == ksn3 && ksn3 == ksn4, (ksn1, ksn2, ksn3, ksn4)) - - if (false) { - - // didn't check in files for testing since they're pretty big - if interested, go to BibNet or I can provide - - val fileText = scala.io.Source.fromFile("inputs/case-based-reasoning.bib.txt").mkString - val res = Dom.stringToDom(fileText, false) - //println(res) - - def timed[T](showTime: Long => String)(body: => T) = { - val start = System.currentTimeMillis - val result = body - val time = showTime(System.currentTimeMillis - start) - logger.debug(time) - (result, time) - } - - val filePath2 = "inputs/domain-decomp.bib.txt" - val file2 = scala.io.Source.fromFile(filePath2).toArray - val fileText2 = file2.mkString - - val numLines = file2.length - val numMb = new java.io.File(filePath2).length / 1024.0 / 1024.0 - - val (result, time) = - timed(t => - "domain-decomp.bib (%f MB, %d lines) parsed and dom-ified in %d ms (%f MB/sec, %f lines/sec)" format - (numMb, numLines, t, (1000.0 * numMb) / t, (1000.0 * numLines) / t)) { - Dom.stringToDom(fileText2, false) - } - - // println(result) - logger.debug(time) - val sizeMult = 10 - val bigtext = List.range(0, sizeMult).map(_ => fileText2).mkString - val (bigresult, bigtime) = - timed(t => - "%d times domain-decomp.bib (%f MB, %d lines) parsed and dom-ified in %d ms (%f MB/sec, %f lines/sec)" format - (sizeMult, numMb * sizeMult, numLines * sizeMult, t, (1000.0 * numMb * sizeMult) / t, (1000.0 * numLines * sizeMult) / t)) { - Dom.stringToDom(bigtext, false) - } - } - } -} diff --git a/src/test/scala/cc/factorie/app/classify/backend/TestClassification.scala b/src/test/scala/cc/factorie/app/classify/backend/TestClassification.scala deleted file mode 100644 index 95fcb88..0000000 --- a/src/test/scala/cc/factorie/app/classify/backend/TestClassification.scala +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.classify.backend - -import cc.factorie.la.{DenseTensor1, DenseTensor2} -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit._ - - -class TestLinearMulticlassClassifier extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testLinearMulticlassClassifier(): Unit = { - val labelSize = 2 - val featureSize = 2 - val classifier = new LinearMulticlassClassifier(labelSize, featureSize) - - // assign a weight - val weightTensor = new DenseTensor2(Array(Array(0.2, 0.4), Array(0.8, 0.6))) - classifier.weights.set(weightTensor) - - // feed the classifier a feature - val features = new DenseTensor1(featureSize) - features(0) = 0.1 - features(1) = 0.9 - assertArrayEquals(weightTensor.leftMultiply(features).toArray, classifier.predict(features).toArray, 0.001) - } - -} diff --git a/src/test/scala/cc/factorie/app/classify/backend/TestNaiveBayes.scala b/src/test/scala/cc/factorie/app/classify/backend/TestNaiveBayes.scala deleted file mode 100644 index 86221d0..0000000 --- a/src/test/scala/cc/factorie/app/classify/backend/TestNaiveBayes.scala +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.classify.backend - -import cc.factorie.app.classify.NaiveBayesClassifierTrainer -import cc.factorie.la.DenseTensor2 -import cc.factorie.variable._ -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit._ - -import scala.collection.mutable - - -class TestNaiveBayes extends JUnitSuite with cc.factorie.util.FastLogging { - - // define the person gender as classification label - object GenderDomain extends CategoricalDomain[String] { - value("male") - value("female") - freeze() - } - - // classification features - object PersonFeaturesDomain extends CategoricalVectorDomain[String] - class PersonFeatures extends BinaryFeatureVectorVariable[String] { - override def domain: CategoricalVectorDomain[String] = PersonFeaturesDomain - } - - // Person can be have features, and a gender label - class Person(gender: String, largeFoot: Boolean, longHair: Boolean) - extends LabeledCategoricalVariable(gender) { - - def domain = GenderDomain - - val features: PersonFeatures = { - val results = new PersonFeatures - if (this.largeFoot) { - results += "LargeFoot" - } - if (this.longHair) { - results += "LongHair" - } - results - } - } - - @Test - def testItemizedModel(): Unit = { - - // Person(gender, largeFoot, longHair) - val p1 = new Person("male", true, false) - val p2 = new Person("male", true, false) - val p3 = new Person("male", true, true) - val p4 = new Person("female", false, true) - val p5 = new Person("female", false, true) - val p6 = new Person("female", false, true) - val p7 = new Person("female", true, true) - - val people = new mutable.ArrayBuffer[Person]() - people ++= Seq(p1, p2, p3, p4, p5, p6, p7) - - // specify 0 to disable smoothing - val trainer = new NaiveBayesClassifierTrainer(0) - - val classifier = trainer.train(people, (person: Person) => person.features) - - // what we expect: - // p(largeFoot|male) = 3/4 - // p(longhair|male) = 1/4 - // p(largeFoot|female) = 1/5 - // p(longhair|female) = 4/5 - val expected = new DenseTensor2(Array(Array(math.log(0.75), math.log(0.2)), Array(math.log(0.25), math.log(0.8)))) - assertArrayEquals(expected.toArray, classifier.weights.value.toArray, 0.001) - - // p(male|largeFoot&longHair) = 0.75 * 0.25 = 0.1875 - // p(female|largeFoot&longHair) = 0.2 * 0.8 = 0.16 - val c = classifier.classify(p7) - assertArrayEquals(Array(math.log(0.1875), math.log(0.16)), c.prediction.toArray, 0.001) - } - -} diff --git a/src/test/scala/cc/factorie/app/mf/TestWSabie.scala b/src/test/scala/cc/factorie/app/mf/TestWSabie.scala deleted file mode 100644 index 8caf705..0000000 --- a/src/test/scala/cc/factorie/app/mf/TestWSabie.scala +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.mf - -import cc.factorie._ -import cc.factorie.la._ -import cc.factorie.optimize.OnlineTrainer -import cc.factorie.variable.DiscreteDomain -import org.junit._ - -/** - * User: apassos - * Date: 4/5/13 - * Time: 3:19 PM - */ -class TestWSabie { - @Test def simpleTestWsabie() { - val d = new DiscreteDomain(3) - val m = new WSabie.WSabieModel(d, 5, new java.util.Random(0)) - val q = new SparseBinaryTensor1(3) - q += 0 - val p = new SparseBinaryTensor1(3) - p += 1 - val n = new SparseBinaryTensor1(3) - n += 2 - val e = new WSabie.WSabieExample(m, q, p, n) - val trainer = new OnlineTrainer(m.parameters) - trainer.optimizer.initializeWeights(m.parameters) - while (!trainer.isConverged) { - trainer.processExamples(Seq(e)) - } - assert(m.score(q, p) > m.score(q, n)) - } -} diff --git a/src/test/scala/cc/factorie/app/nlp/TestCompoundDocumentAnnotator.scala b/src/test/scala/cc/factorie/app/nlp/TestCompoundDocumentAnnotator.scala deleted file mode 100644 index 8ed2608..0000000 --- a/src/test/scala/cc/factorie/app/nlp/TestCompoundDocumentAnnotator.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp - -import cc.factorie.app.nlp.segment.{DeterministicNormalizingTokenizer, DeterministicSentenceSegmenter} -import org.scalatest.{FlatSpec, Matchers} - -/** - * @author John Sullivan - */ -class TestCompoundDocumentAnnotator extends FlatSpec with Matchers { - def fix = new { - val doc = new Document("Better to sleep with a sober cannibal than a drunken Christian.") - } - - "CompoundDocumentAnnotator" should "work properly" in { - val f = fix - import f._ - - val annos = Seq(DeterministicNormalizingTokenizer, DeterministicSentenceSegmenter) - - val compAnno = new CompoundDocumentAnnotator(annos) - compAnno process doc - - assert(doc.annotators.keySet contains classOf[Token]) - assert(doc.annotators.keySet contains classOf[Sentence]) - } -} diff --git a/src/test/scala/cc/factorie/app/nlp/TestDocumentAnnotatorMap.scala b/src/test/scala/cc/factorie/app/nlp/TestDocumentAnnotatorMap.scala deleted file mode 100644 index ab5d7ee..0000000 --- a/src/test/scala/cc/factorie/app/nlp/TestDocumentAnnotatorMap.scala +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp -import cc.factorie.app.nlp.coref.{Mention, MentionList} -import cc.factorie.app.nlp.lemma.WordNetTokenLemma -import cc.factorie.app.nlp.ner.{BilouConllNerTag, BilouOntonotesNerTag} -import cc.factorie.app.nlp.parse.ParseTree -import cc.factorie.app.nlp.phrase.NounPhraseType -import cc.factorie.app.nlp.pos.PennPosTag -import org.junit.Test - -/** - * User: apassos - * Date: 7/18/13 - * Time: 9:06 AM - */ -class TestDocumentAnnotatorMap { - @Test def testDefaultPipelines() { - // this map mirrors the default one without loading the models themselves. There should - // be a less awkward way of doing this - val map = new MutableDocumentAnnotatorMap - object pos1 extends DocumentAnnotator { - def prereqAttrs: Iterable[Class[_]] = List(classOf[Sentence], classOf[segment.PlainNormalizedTokenString]) - def postAttrs: Iterable[Class[_]] = List(classOf[PennPosTag]) - def process(document: Document) = document - def tokenAnnotationString(token: Token) = "" - } - map += pos1 - object parser1 extends DocumentAnnotator { - def prereqAttrs = Seq(classOf[Sentence], classOf[PennPosTag], classOf[lemma.WordNetTokenLemma]) // Sentence also includes Token - def postAttrs = Seq(classOf[ParseTree]) - def process(d: Document) = d - def tokenAnnotationString(t: Token) = "" - } - map += parser1 - map += segment.PlainTokenNormalizer - map += cc.factorie.app.nlp.segment.DeterministicNormalizingTokenizer - map += cc.factorie.app.nlp.segment.DeterministicSentenceSegmenter - object wnLemma extends DocumentAnnotator { - def prereqAttrs: Iterable[Class[_]] = List(classOf[PennPosTag]) - def postAttrs: Iterable[Class[_]] = List(classOf[WordNetTokenLemma]) - def process(d: Document) = d - def tokenAnnotationString(t: Token) = "" - } - map += wnLemma - map += lemma.SimplifyDigitsLemmatizer - map += lemma.CollapseDigitsLemmatizer - map += lemma.PorterLemmatizer - map += lemma.LowercaseLemmatizer - object ner1 extends DocumentAnnotator { - def tokenAnnotationString(token:Token): String = token.attr[BilouConllNerTag].categoryValue - def prereqAttrs: Iterable[Class[_]] = List(classOf[Sentence]) - def postAttrs: Iterable[Class[_]] = List(classOf[BilouConllNerTag]) - def process(d: Document) = d - } - map += ner1 - object ner2 extends DocumentAnnotator { - override def tokenAnnotationString(token:Token): String = token.attr[BilouOntonotesNerTag].categoryValue - def prereqAttrs: Iterable[Class[_]] = List(classOf[Token]) - def postAttrs: Iterable[Class[_]] = List(classOf[BilouOntonotesNerTag]) - def process(document:Document): Document = document - } - map += ner2 - object parseBasedMentionFinding extends DocumentAnnotator { - def prereqAttrs: Iterable[Class[_]] = Seq(classOf[parse.ParseTree]) - def postAttrs: Iterable[Class[_]] = Seq(classOf[MentionList]) - override def tokenAnnotationString(token:Token): String = token.document.attr[MentionList].filter(mention => mention.phrase.contains(token)) match { case ms:Seq[Mention] if ms.length > 0 => ms.map(m => m.phrase.attr[NounPhraseType].categoryValue+":"+m.phrase.indexOf(token)).mkString(","); case _ => "_" } - def process(d: Document) = d - } - map += 
parseBasedMentionFinding -// object coref1 extends DocumentAnnotator { -// def tokenAnnotationString(token: Token) = "" -// def prereqAttrs = Seq(classOf[MentionList], classOf[OntonotesPhraseEntityType], classOf[PhraseGender], classOf[PhraseNumber]) -// def postAttrs = Seq(classOf[GenericEntityMap[Mention]]) -// def process(document: Document) = document -// } -// map += coref1 -// map += MentionPhraseGenderLabeler -// map += MentionPhraseNumberLabeler -// object mentionEntityType extends DocumentAnnotator { -// def tokenAnnotationString(token:Token): String = { val mentions = token.document.attr[MentionList].filter(_.phrase.contains(token)); mentions.map(_.phrase.attr[OntonotesPhraseEntityType].categoryValue).mkString(",") } -// def prereqAttrs: Iterable[Class[_]] = List(classOf[MentionList]) -// def postAttrs: Iterable[Class[_]] = List(classOf[OntonotesPhraseEntityType]) -// def process(d: Document) = d -// } -// map += mentionEntityType - for (key <- map.keys) { - DocumentAnnotatorPipeline(map.toMap, Nil, Seq(key)) - // println(s"Pipeline for $key is ${pipeline.mkString(" ")}") - } - DocumentAnnotatorPipeline(map.toMap, Nil, map.keys.toSeq) - } -} diff --git a/src/test/scala/cc/factorie/app/nlp/TestDocumentStore.scala b/src/test/scala/cc/factorie/app/nlp/TestDocumentStore.scala deleted file mode 100644 index 35ab15d..0000000 --- a/src/test/scala/cc/factorie/app/nlp/TestDocumentStore.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp - -import cc.factorie.app.nlp.parse._ -import cc.factorie.app.nlp.pos.{PennPosDomain, PennPosTag} -import org.scalatest._ - -/** Test serialization of Document to BSON. - @author John Sullivan, Andrew McCallum - */ -class TestDocumentStore extends FlatSpec with Matchers { - - def fix = new { - val doc1 = new Document("If it's your job to eat a frog, it's best to do it first thing in the morning. 
And If it's your job to eat two frogs, it's best to eat the biggest one first.") - DocumentAnnotatorPipeline(segment.DeterministicNormalizingTokenizer, segment.DeterministicSentenceSegmenter).process(doc1) - for (token <- doc1.tokens) token.attr += new PennPosTag(token, token.positionInSentence % PennPosDomain.size) - for (sentence <- doc1.sentences) sentence.attr += new ParseTree(sentence, Range(0, sentence.length).toArray, Range(0, sentence.length).map(_ % ParseTreeLabelDomain.length).toArray) - doc1.annotators(classOf[PennPosTag]) = this.getClass - doc1.annotators(classOf[ParseTree]) = this.getClass - } - - "DocumentCubbie" should "serialize and deserialize properly" in { - val f = fix - import f._ - - val cubbie = new StandardDocumentCubbie() := doc1 - val doc2 = cubbie.document - - assert(doc1.tokens.toSeq.map(_.string) == doc2.tokens.toSeq.map(_.string)) - assert(doc1.tokens.toSeq.map(_.posTag.categoryValue) == doc2.tokens.toSeq.map(_.posTag.categoryValue)) - } -/* - it should "preserve document annotation metadata" in { - val f = fix - import f._ - - val cubbie = new StandardDocumentCubbie() := doc1 - val doc2 = cubbie.document - - assert(doc1.annotators.keySet == doc2.annotators.keySet) - - } -*/ -} diff --git a/src/test/scala/cc/factorie/app/nlp/TokenSpanTests.scala b/src/test/scala/cc/factorie/app/nlp/TokenSpanTests.scala deleted file mode 100644 index 3f2378c..0000000 --- a/src/test/scala/cc/factorie/app/nlp/TokenSpanTests.scala +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp - -import org.scalatest._ - -/** - * @author John Sullivan - */ -class TokenSpanTests extends FlatSpec with Matchers { - - def fixture = new { - val doc = new Document() - "Jaques and Jill went up the hill to fetch a pail of water .".split(' ').foreach { s => - new Token(doc, s) - } - - val span = new TokenSpan(doc.asSection, 5, 2) - - } - - - "TokenSpan" should "calculate context windows properly" in { - val f = fixture - import f._ - assert(Seq("went", "up", "to", "fetch") == span.contextWindow(2).map(_.string)) - assert(Seq.empty[String] == span.contextWindow(0)) - assert("Jaques and Jill went up to fetch a pail of water .".split(' ').toSeq == span.contextWindow(10).map(_.string)) - } -} diff --git a/src/test/scala/cc/factorie/app/nlp/TokenTests.scala b/src/test/scala/cc/factorie/app/nlp/TokenTests.scala deleted file mode 100644 index 0d90f08..0000000 --- a/src/test/scala/cc/factorie/app/nlp/TokenTests.scala +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp - -import org.scalatest.{FlatSpec, Matchers} - -/** - * @author John Sullivan - */ -class TokenTests extends FlatSpec with Matchers { - - def fixture = new { - val doc = new Document() - "Jaques and Jill went up the hill to fetch a pail of water .".split(' ').foreach { s => - new Token(doc, s) - } - - val span = new TokenSpan(doc.asSection, 5, 2) - } - - "Token" should "calculate context bags properly" in { - val f = fixture - import f._ - assert(doc.tokens.toSeq(2).string == "Jill") - assert(doc.tokens.toSeq(2).contextBag(3).map(_.string).toSet == "Jaques and went up the".split(" ").toSet) - } -} diff --git a/src/test/scala/cc/factorie/app/nlp/lexicon/TestLexicon.scala b/src/test/scala/cc/factorie/app/nlp/lexicon/TestLexicon.scala deleted file mode 100644 index d01c9b9..0000000 --- a/src/test/scala/cc/factorie/app/nlp/lexicon/TestLexicon.scala +++ /dev/null @@ -1,150 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.lexicon - -import cc.factorie.app.nlp._ -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -class TestLexicon extends JUnitSuite { - - @Test def testOldVsNew(): Unit = { - val newlex = new PhraseLexicon("new") - val oldlex = new ChainPhraseLexicon("old") - val words = List("New York", "Paris", "San Franciscon") - words.foreach(word => { - newlex += word - oldlex += word - }) - - assert(newlex.contains("Paris") && oldlex.contains("Paris")) - assert(newlex.contains("New York") && oldlex.contains("New York")) - - val string = "Yesterday I flew from Paris to New York." 
- val doc = DocumentAnnotatorPipeline(segment.DeterministicNormalizingTokenizer).process(new Document(string)) - val section = doc.asSection - val tok = section.tokens(4) - assert(tok.string == "Paris") - assert(newlex.contains(tok) && oldlex.contains(tok)) - - assert(section.tokens(6).string == "New") - val toks = List(section.tokens(6), section.tokens(7)) - assert(newlex.contains(toks)) - assert(oldlex.contains(toks)) - - /* the "startsAt" tests fail due to an Error thrown by ChainPhraseLexicon.startsAt */ -// assert(newlex.startsAt(tok) == oldlex.startsAt(tok)) - - } - -// problems loading these due to classpath issues -KS -// @Test def testLexiconContent(): Unit = { -// val st = "accessor fds inc balanced allocation fd" -// val newlex = lexicon.iesl.Company -// val oldlex = lexicon.iesl.TestCompany -// assert (newlex.contains(st) && oldlex.contains(st)) -// } - - @Test def testLexiconSingleWords(): Unit = { - val lexicon = new PhraseLexicon("test1") - lexicon += "one" - lexicon += "two" - lexicon += "three" - assert(lexicon.contains("one")) - assert(lexicon.contains("three")) - assert(!lexicon.contains("four")) - } - - @Test def testLexiconPhrases(): Unit = { - val lexicon = new PhraseLexicon("test2") - lexicon += "Paris" - assert(lexicon.contains("Paris")) - lexicon += "San Fransisco" - assert(lexicon.contains("Paris")) - lexicon += "San Diego" - lexicon += "New York" - lexicon += "New Hampshire" - lexicon += "Oklahoma City" - lexicon += "London" - assert(lexicon.contains("Paris")) - assert(lexicon.contains("New York")) - assert(lexicon.contains("New Hampshire")) - assert(lexicon.containsWords(List("New", "York"))) - assert(lexicon.containsWords(List("New", "Hampshire"))) - assert(!lexicon.containsWords(List("New", "Brunswick"))) - assert(!lexicon.contains("England")) - assert(!lexicon.containsWord("England")) - - val string = "Yesterday I flew from Paris to New York." 
- val doc = DocumentAnnotatorPipeline(segment.DeterministicNormalizingTokenizer).process(new Document(string)) - val section = doc.asSection - assert(section.tokens(4).string == "Paris") - assert(lexicon.contains(section.tokens(4))) - val toks: Seq[Token] = List(section.tokens(6), section.tokens(7)).toSeq - assert(section.tokens(6).string == "New") - assert(section.tokens(7).string == "York") - assert(lexicon.contains(toks)) - /* these won't pass using HashyLexicon, if you need these to pass use PhraseLexicon instead */ - // assert(section.tokens(7).string == "York") - // assert(lexicon.contains(section.tokens(7))) - // assert(lexicon.contains(section.tokens(6))) - - assert(section.tokens(5).string == "to") - assert(!lexicon.contains(section.tokens(5))) - } - - - @Test def testChainLexiconSingleWords(): Unit = { - val lexicon = new ChainPhraseLexicon("test1") - lexicon += "one" - lexicon += "two" - lexicon += "three" - assert(lexicon.contains("one")) - assert(lexicon.contains("three")) - assert(!lexicon.contains("four")) - } - - @Test def testChainLexiconPhrases(): Unit = { - val lexicon = new ChainPhraseLexicon("test2") - lexicon += "Paris" - assert(lexicon.contains("Paris")) - lexicon += "San Fransisco" - assert(lexicon.contains("Paris")) - lexicon += "San Diego" - lexicon += "New York" - lexicon += "Oklahoma City" - lexicon += "London" - assert(lexicon.contains("Paris")) - assert(lexicon.contains("Paris")) - assert(lexicon.contains("New York")) - assert(lexicon.containsWords(List("New", "York"))) - assert(!lexicon.containsWords(List("New", "Hampshire"))) - assert(!lexicon.contains("England")) - assert(!lexicon.containsWord("England")) - - val string = "Yesterday I flew from Paris to New York." - val doc = DocumentAnnotatorPipeline(segment.DeterministicNormalizingTokenizer).process(new Document(string)) - val section = doc.asSection - assert(section.tokens(4).string == "Paris") - assert(lexicon.contains(section.tokens(4))) - assert(section.tokens(7).string == "York") - assert(lexicon.contains(section.tokens(7))) - assert(lexicon.contains(section.tokens(6))) - assert(section.tokens(5).string == "to") - assert(!lexicon.contains(section.tokens(5))) - - //println(lexicon.phrases) - assert(lexicon.phrases.contains("new york")) - } - -} diff --git a/src/test/scala/cc/factorie/app/nlp/lexicon/TestTriePhraseLexicon.scala b/src/test/scala/cc/factorie/app/nlp/lexicon/TestTriePhraseLexicon.scala deleted file mode 100644 index f55feeb..0000000 --- a/src/test/scala/cc/factorie/app/nlp/lexicon/TestTriePhraseLexicon.scala +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.lexicon - -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -/** - * @author Kate Silverstein - * created on 1/12/15 - */ - -class TestTriePhraseLexicon extends JUnitSuite{ - val phrase = "the quick brown fox jumped over the lazy dog" - - @Test - def testContainsLemmatizedWord(): Unit = { - val lexicon = new TriePhraseLexicon("test") - phrase.split(" ").foreach(lexicon += _) - assert(lexicon.containsLemmatizedWord("fox")) - } - - @Test - def testContainsLemmatizedWords(): Unit = { - val lexicon = new TriePhraseLexicon("test") - lexicon += phrase - assert(lexicon.containsLemmatizedWords(phrase.split(" "))) - } - - @Test - def testMultiword(): Unit = { - val words = List("the", "quick brown", "fox") - val lexicon = new TriePhraseLexicon("test") - words.foreach(lexicon += _) - assert(lexicon.containsLemmatizedWord("fox")) - assert(lexicon.containsLemmatizedWords(Seq("quick", "brown"))) - } - -} - - diff --git a/src/test/scala/cc/factorie/app/nlp/ner/TestNerTaggers.scala b/src/test/scala/cc/factorie/app/nlp/ner/TestNerTaggers.scala deleted file mode 100644 index 61fae27..0000000 --- a/src/test/scala/cc/factorie/app/nlp/ner/TestNerTaggers.scala +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.ner - -/** - * @author Kate Silverstein - * created on 3/23/15 - */ - -import cc.factorie.app.nlp.load._ -import org.scalatest._ - -class TestNerTaggers extends FlatSpec { - val conllTestFilename = this.getClass.getResource("/conll-ner-input").getPath - val ontoTestFilename = this.getClass.getResource("/parser-test-input").getPath - "LoadConll2003" should "load 2 documents" in { - val testDocs = LoadConll2003.fromFilename(conllTestFilename) - println(testDocs.length) - testDocs.foreach(d => println(d.sections.flatMap(_.tokens).mkString(","))) - assert(testDocs.length == 2, "failed to load documents") - assert(testDocs.head.tokenCount > 0, "failed to load document with tokens") - assert(testDocs.head.sections.flatMap(_.tokens).forall(t => t.attr.contains(classOf[LabeledBioConllNerTag])), "token with no LabeledIobConllNerTag") - val bilouTestDocs = LoadConll2003(BILOU=true).fromFilename(conllTestFilename) - assert(bilouTestDocs.length == 2, "failed to load documents") - assert(bilouTestDocs.head.tokenCount > 0, "failed to load document with tokens") - assert(bilouTestDocs.head.sections.flatMap(_.tokens).forall(t => t.attr.contains(classOf[LabeledBilouConllNerTag])), "token with no LabeledBilouConllNerTag") - } - "LoadOntonotes5" should "load 1 document" in { - val testDocs = LoadOntonotes5.fromFilename(ontoTestFilename) - assert(testDocs.length == 1, "failed to load documents") - assert(testDocs.head.tokenCount > 0, "failed to load document with tokens") - assert(testDocs.head.sections.flatMap(_.tokens).forall(t => t.attr.contains(classOf[LabeledBilouOntonotesNerTag])), "token with no LabeledBilouOntonotesNerTag") - } - // TODO add an actual test for training/testing ChainNer, but without loading all of the lexicons (since this takes awhile) -ks -} diff --git a/src/test/scala/cc/factorie/app/nlp/parse/TestCollapsedParseTree.scala b/src/test/scala/cc/factorie/app/nlp/parse/TestCollapsedParseTree.scala deleted file mode 100644 index 204f5f3..0000000 --- a/src/test/scala/cc/factorie/app/nlp/parse/TestCollapsedParseTree.scala +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.parse - -import cc.factorie.app.nlp.load.LoadOntonotes5 -import org.junit.{Assert, Test} - -/** - * Created by diwe01 on 17.06.14. 
- */ -class TestCollapsedParseTree { - - val testFileName = this.getClass.getResource("/parser-test-input").getPath() - - @Test - def testCollapsing() = { - val testDoc = LoadOntonotes5.fromFilename(testFileName).head - val testSentence = testDoc.sentences.tail.head - - val tree = ParseTree2.collapsedFromParseTree(testSentence.parse) - Assert.assertNotNull(tree) - Assert.assertNotNull(tree.toString) - Assert.assertEquals(tree.labels.length, tree.parents.length) - Assert.assertEquals(tree.labels.length, tree.vertices.length) - } - -} diff --git a/src/test/scala/cc/factorie/app/nlp/parse/TestTransitionBasedParser.scala b/src/test/scala/cc/factorie/app/nlp/parse/TestTransitionBasedParser.scala deleted file mode 100644 index 357ed60..0000000 --- a/src/test/scala/cc/factorie/app/nlp/parse/TestTransitionBasedParser.scala +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.nlp.parse - -import cc.factorie.app.nlp.load._ -import org.junit.{Assert, Before, Test} - -class TestTransitionBasedParser { - - val testFileName = this.getClass.getResource("/parser-test-input").getPath() - var parser: TransitionBasedParser = _ - - @Before - def setUp() = { - parser = new TransitionBasedParser() - } - - @Test - def testDepToken() = { - - val numThreads = 1 - - /* This file contains just one sentence right now */ - val testDoc = LoadOntonotes5.fromFilename(testFileName).head - val testSentences = testDoc.sentences - - val parseDecisions = parser.generateDecisions(testSentences, ParserConstants.TRAINING, numThreads) - - /* Check that the relations between tokens are correct */ - parseDecisions.map(_.last).zip(testSentences).foreach(ds => { - val parseTree = ds._2.attr[ParseTree] - println(s"Sentence: ${ds._2.tokens.map(_.string).mkString(" ")}") - val state = ds._1.state - val sentence = state.sentence - val tokens = state.sentence._tokens - val heads = state.headIndices - val labels = state.arcLabels - - tokens.zip(1 until tokens.length).foreach { case (tok, idx) => { - val parseTreeIdx = idx - 1 - val thisHead = if (heads(idx) != -1) sentence(heads(idx)) else null - val trueHead = parseTree.parent(parseTreeIdx) - if (trueHead == null || thisHead == null) { - // if one has no head then neither should, and this should be the root - if (thisHead != null) { - Assert.assertEquals(s"Head of token ``${tok.string}'' incorrect.", ParserConstants.ROOT_STRING, thisHead.string) - Assert.assertEquals(s"Label of token ``${tok.string}'' incorrect.", "root", labels(idx)) - } else { - Assert.assertNotNull(s"Head of token ``${tok.string}'' incorrect.", thisHead) - } - } else { - // should be the same word - Assert.assertEquals(s"Head of token ``${tok.string}'' incorrect.", trueHead.string, thisHead.string) - - // labels should be the same - Assert.assertEquals(s"Label of token ``${tok.string}'' 
incorrect.", parseTree.label(parseTreeIdx).categoryValue, labels(idx)) - - // leftmost dependents should be the same - val thisLeftmostDep = sentence(state.leftmostDependent(idx)) - val trueLeftmostDep = if (!parseTree.leftChildren(parseTreeIdx).isEmpty) parseTree.leftChildren(parseTreeIdx).head else null - if (thisLeftmostDep == null || trueLeftmostDep == null) { - // if one is null then they both should be - if (thisLeftmostDep != null) - Assert.assertEquals(s"Leftmost dependency of token ``${tok.string}'' incorrect.", ParserConstants.NULL_STRING, thisLeftmostDep.string) - else - Assert.assertNotNull(s"Leftmost dependency of token ``${tok.string}'' incorrect.", thisLeftmostDep) - } else { - // should be the same word - Assert.assertEquals(s"Leftmost dependency of token ``${tok.string}'' incorrect.", trueLeftmostDep.string, thisLeftmostDep.string) - - // 2nd leftmost dependents should be the same - val thisLeftmostDep2 = sentence(state.leftmostDependent2(idx)) - val trueLeftmostDep2 = if (!trueLeftmostDep.parseLeftChildren.isEmpty) trueLeftmostDep.parseLeftChildren.head else null - if (thisLeftmostDep2 == null || trueLeftmostDep2 == null) { - // if one is null then they both should be - if (thisLeftmostDep != null) - Assert.assertEquals(s"2nd leftmost dependency of token ``${tok.string}'' incorrect.", ParserConstants.NULL_STRING, thisLeftmostDep2.string) - else - Assert.assertNotNull(s"2nd leftmost dependency of token ``${tok.string}'' incorrect.", thisLeftmostDep2) - } else { - // should be same word - Assert.assertEquals(s"2nd leftmost dependency of token ``${tok.string}'' incorrect.", trueLeftmostDep2.string, thisLeftmostDep2.string) - } - } - - // rightmost dependents should be the same - val thisRightmostDep = sentence(state.rightmostDependent(idx)) - val trueRightmostDep = if (!parseTree.rightChildren(parseTreeIdx).isEmpty) parseTree.rightChildren(parseTreeIdx).last else null - - if (thisRightmostDep == null || trueRightmostDep == null) { - // if one is null then they both should be - if (thisRightmostDep != null) - Assert.assertEquals(s"Rightmost dependency of token ``${tok.string}'' incorrect.", ParserConstants.NULL_STRING, thisRightmostDep.string) - else - Assert.assertNotNull(s"Rightmost dependency of token ``${tok.string}'' incorrect.", thisRightmostDep) - } else { - // should be the same word - Assert.assertEquals(s"Rightmost dependency of token ``${tok.string}'' incorrect.", trueRightmostDep.string, thisRightmostDep.string) - - // 2nd leftmost dependents should be the same - val thisRightmostDep2 = sentence(state.rightmostDependent2(idx)) - val trueRightmostDep2 = if (!trueRightmostDep.parseRightChildren.isEmpty) trueRightmostDep.parseRightChildren.last else null - if (thisRightmostDep2 == null || trueRightmostDep2 == null) { - // if one is null then they both should be - if (thisRightmostDep2 != null) - Assert.assertEquals(s"2nd rightmost dependency of token ``${tok.string}'' incorrect.", ParserConstants.NULL_STRING, thisRightmostDep2.string) - else - Assert.assertNotNull(s"2nd rightmost dependency of token ``${tok.string}'' incorrect.", thisRightmostDep2) - } else { - // should be same word - Assert.assertEquals(s"2nd rightmost dependency of token ``${tok.string}'' incorrect.", trueRightmostDep2.string, thisRightmostDep2.string) - } - } - - // left-nearest siblings should be the same - val thisLeftNearestSib = sentence(state.leftNearestSibling(idx)) - val trueParentIdx = parseTree.sentence(parseTreeIdx).parseParentIndex - val trueLeftNearestSib = { - var i = parseTreeIdx - 
1 - while (i >= 0 && parseTree.sentence(i).parseParentIndex != trueParentIdx) i -= 1 - if (i == -1) null else parseTree.sentence(i) - } - - if (trueLeftNearestSib == null || thisLeftNearestSib == null) { - // if one is null then they both should be - if (thisLeftNearestSib != null) - Assert.assertEquals(s"Left nearest sibling of token ``${tok.string}'' incorrect.", ParserConstants.NULL_STRING, thisLeftNearestSib.string) - else - Assert.assertNotNull(s"Left nearest sibling of token ``${tok.string}'' incorrect.", thisLeftNearestSib) - } else { - // should be same word - Assert.assertEquals(s"Left nearest sibling of token ``${tok.string}'' incorrect.", trueLeftNearestSib.string, thisLeftNearestSib.string) - } - - // right-nearest siblings should be the same - val thisRightNearestSib = sentence(state.rightNearestSibling(idx)) - val trueRightNearestSib = { - var i = parseTreeIdx + 1 - while (i < parseTree.sentence.size && parseTree.sentence(i).parseParentIndex != trueParentIdx) i += 1 - if (i == parseTree.sentence.size) null else parseTree.sentence(i) - } - - if (trueRightNearestSib == null || thisRightNearestSib == null) { - // if one is null then they both should be - if (thisRightNearestSib != null) - Assert.assertEquals(s"Right nearest sibling of token ``${tok.string}'' incorrect.", ParserConstants.NULL_STRING, thisRightNearestSib.string) - else - Assert.assertNotNull(s"Right nearest sibling of token ``${tok.string}'' incorrect.", thisRightNearestSib) - } else { - // should be same word - Assert.assertEquals(s"Right nearest sibling of token ``${tok.string}'' incorrect.", trueRightNearestSib.string, thisRightNearestSib.string) - } - } - }} - }) - /* Print out the features for the first sentence */ - parseDecisions.head.foreach(decision => { - print(s"${ - // convert decision to a nice verbose string (rather than ints) - val transition = decision.categoryValue.split(" ") - transition.take(2).map(x => ParserConstants(x.toInt)).mkString(" ") + " " + transition(2) - }; ") - println(s"feats: ${decision.features.activeCategories.mkString(", ")}")//domain.dimensionDomain.categories.zip(decision.features.value.toSeq).filter(_._2 == 1.0).map(_._1).mkString(" ")}") - println() - }) - } -} \ No newline at end of file diff --git a/src/test/scala/cc/factorie/app/nlp/segment/TestBigramStatistics.scala b/src/test/scala/cc/factorie/app/nlp/segment/TestBigramStatistics.scala deleted file mode 100644 index f8cbeb9..0000000 --- a/src/test/scala/cc/factorie/app/nlp/segment/TestBigramStatistics.scala +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.segment -import org.junit.Test - -/** - * User: apassos - * Date: 8/19/13 - * Time: 2:24 PM - */ -class TestBigramStatistics { - @Test def testBigramStatistics() { - val gpl = new cc.factorie.app.nlp.Document(cc.factorie.tutorial.WordSegmenter.data.mkString("\n")) - DeterministicNormalizingTokenizer.process(gpl) - val bg = new BigramStatistics - bg.process(gpl) - val phrases = bg.getLikelyPhrases(5, 40) - assert(phrases.exists(p => p(0) == "free" && p.length > 1 && p(1) == "software")) - assert(!phrases.exists(p => p(0) == "you" && p.length > 1 && p(1) == "may")) - } -} diff --git a/src/test/scala/cc/factorie/app/nlp/segment/TestLexerTokenizer.scala b/src/test/scala/cc/factorie/app/nlp/segment/TestLexerTokenizer.scala deleted file mode 100644 index d31012e..0000000 --- a/src/test/scala/cc/factorie/app/nlp/segment/TestLexerTokenizer.scala +++ /dev/null @@ -1,364 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -package cc.factorie.app.nlp.segment - -import cc.factorie.app.nlp.{Document, DocumentAnnotatorPipeline, Sentence, Token} -import cc.factorie.util.FastLogging -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -class TestLexerTokenizer extends JUnitSuite with FastLogging { - - def assertEquals(a: Any, b: Any): Unit = assert(a == b, "\"%s\" did not equal \"%s\"" format (a, b)) - - def checkDeterministicLexerTokenizer(src: String, trg: String): Unit = { - val tokens = runTokenizer(DeterministicTokenizer, src) - for (t <- tokens) { - assertEquals(t.string, src.substring(t.stringStart, t.stringEnd)) - } - assertEquals("[" + tokens.map(_.string).mkString(", ") + "]", trg) - } - - def checkDeterministicNormalizingTokenizer(tokenizer: DeterministicLexerTokenizer, src: String, trg: String): Unit = { - val tokens = runTokenizer(tokenizer, src) - for (t <- tokens) { - assertEquals(t.document.string.substring(t.stringStart, t.stringEnd), src.substring(t.stringStart, t.stringEnd)) - } - assertEquals("[" + tokens.map(_.string).mkString(", ") + "]", trg) - } - - @Test def testSentenceSegmenter(): Unit = { - val text = - """ - I got bored of the iPhone. This has been a conundrum for for the past few months. I wanted a new phone, but didn't know how to transcend away from the iOS environment. I woke up one morning, and said, "whatever, I don't care anymore," and walked into BestBuy(save the hate, I had a $25 coupon and $35 gift card) and bought the Note 2. I have been wanting this phone since I first hear about it. - - Why I made this decision. I love tech stuff, and the iphone, to me, was getting boring. My early upgrade just came through, so I could test a new phone, and still be within the launch period of the next gen iPhone. Having gone from the iPhone 3G, to Iphone4, to the 4S, you would think I would be drawn to the iPhone5 right? No. It did nothing for me. 
I don't even know how to explain it. - - These are some things that worried me about switching. I have a Mac, and though I use Windows7 on BC, I still want my phone to sync natively to my Mac. The worry that syncing wouldn't be as smooth as the iPhone had me. I don't think there's another phone in existence that syncs info as well as the iPhone on an Mac. I had gotten used to easily syncing EVERYTHING in one go with Itunes and iPhoto, but I decided to just go with it and use it as an experience. - - Now that I actually own the Note 2, and more specifically an Android phone, I actually have a better understanding of the OS quality provided by Apple. - However, with that said, the Android 4.1 is awesome. Better than anything to come before it from Android(obviously, right?). This phone is an absolute MONSTER! - - I now use my iphone as an alarm clock and is the bluetooth source to play music in my car. - """.stripMargin - val d = new Document((1 to 2).map(_ => text).mkString("\n")) - DocumentAnnotatorPipeline(DeterministicNormalizingTokenizer, DeterministicSentenceSegmenter).process(d) - assert(d.sentences.size > 0) - d.sentences.map(_.string).foreach(s => logger.debug(s.toString)) - } - - @Test def testSentenceSegmenterWithOneSentence() { - val text = "The quick brown fox jumps over the lazy dog." - val d = new Document(text) - DocumentAnnotatorPipeline(DeterministicNormalizingTokenizer, DeterministicSentenceSegmenter).process(d) - assert(d.sentences.size == 1) - assert(d.tokens.size == 10) - } - - private def runTokenizer(tokenizer: DeterministicLexerTokenizer, src: String): Iterable[Token] = { - val d = new Document(src) - DocumentAnnotatorPipeline(tokenizer).process(d).tokens - } - - @Test def testDeterministicLexerTokenizer(): Unit = { - - // spaces - checkDeterministicLexerTokenizer( - src = "a b c\n d \t\n\r\fe", - trg = "[a, b, c, d, e]") - - // emoticons - checkDeterministicLexerTokenizer( - src = ":-)))) :----) :( :-) :--)", - trg = "[:-)))), :----), :(, :-), :--)]") - - // URLs - checkDeterministicLexerTokenizer( - src = "|http://www.google.com|www.google.com|mailto:somebody@google.com|some-body@google+.com|", - trg = "[|, http://www.google.com, |, www.google.com, |, mailto:somebody@google.com, |, some-body@google+.com, |]") - - checkDeterministicLexerTokenizer( - src = "google.com index.html a.b.htm ab-cd.shtml", - trg = "[google.com, index.html, a.b.htm, ab-cd.shtml]") - - // abbreviations - checkDeterministicLexerTokenizer( - src = "prof. ph.d. a. a.b. a.b a.b.c. men.cd ab.cd", - trg = "[prof., ph.d., a., a.b., a.b, a.b.c., men, ., cd, ab, ., cd]") - - // consecutive punctuation - checkDeterministicLexerTokenizer( - src = "A..B!!C??D.!?E.!?.!?F..!!??", - trg = "[A, .., B, !!, C, ??, D, .!?, E, .!?.!?, F, ..!!??]") - - checkDeterministicLexerTokenizer( - src = ",,A---C*D**E~~~~F==", - trg = "[,,, A, ---, C, *, D, **, E, ~~~~, F, ==]") - // was: trg = "[,,, A, ---, C*D, **, E, ~~~~, F, ==]") - - // dots in numbers - // Really? Do we want this? 
-akm -// checkDeterministicLexerTokenizer( -// src = ".1 a.1 2.3 4,5 6:7 8-9 0/1 '2 3's 3'4 5'b a'6 a'b", -// trg = "[.1, a.1, 2.3, 4,5, 6:7, 8-9, 0/1, '2, 3's, 3'4, 5'b, a'6, a'b]") - -// checkDeterministicLexerTokenizer( -// src = ".a a.3 4,a a:a a8-9 0/1a", -// trg = "[., a, a.3, 4, ,, a, a, :, a, a8-9, 0/1a]") - - // hyphens - checkDeterministicLexerTokenizer( - src = "dis-able cross-validation o-kay art-o-torium s-e-e art-work", - trg = "[dis-able, cross-validation, o-kay, art-o-torium, s-e-e, art, -, work]") - - // apostrophies - checkDeterministicLexerTokenizer( - src = "he's we'd I'm you'll they're I've didn't did'nt", - trg = "[he, 's, we, 'd, I, 'm, you, 'll, they, 're, I, 've, did, n't, did, 'nt]") - - checkDeterministicLexerTokenizer( - src = "he'S DON'T gue'ss", - trg = "[he, 'S, DO, N'T, gue, ', ss]") - // Was: trg = "[he, 'S, DO, N'T, gue'ss]") - - // Really? Do we want this? -akm -// checkDeterministicLexerTokenizer( -// src = "aint cannot don'cha d'ye i'mma dunno", -// trg = "[ai, nt, can, not, do, n', cha, d', ye, i, 'm, ma, du, n, no]") - - checkDeterministicLexerTokenizer( - src = "$1 E2 L3 USD1 2KPW ||$1 USD1..", - trg = "[$, 1, E2, L3, USD, 1, 2, KPW, |, |, $, 1, USD, 1, ..]") - - checkDeterministicLexerTokenizer( - src = "1m 2mm 3kg 4oz", - trg = "[1, m, 2, mm, 3, kg, 4, oz]") - - checkDeterministicLexerTokenizer( - src = "1D 2nM 3CM 4LB", - trg = "[1, D, 2, nM, 3, CM, 4, LB]") - - checkDeterministicLexerTokenizer( - src = "(1){2}[3]<4>", - trg = "[(, 1, ), {, 2, }, [, 3, ], <, 4, >]") - - // Really? Do we want this? -akm -// checkDeterministicLexerTokenizer( -// src = "`a'b,c:d;e-f/g\"h'", -// trg = "[`, a'b, ,, c, :, d, ;, e, -, f, /, g, \", h, ']") -// checkDeterministicLexerTokenizer( -// src = "`a'b,c:d;e-f/g\"h'", -// trg = "[`, a'b, ,, c, :, d, ;, e, -, f, /, g, \", h, ']") - - checkDeterministicLexerTokenizer( - src = "a@b #c$d%e&f|g", - trg = "[a@b, #, c$, d, %, e, &, f, |, g]") - // Was: trg = "[a@b, #, c, $, d, %, e, &, f, |, g]") - - checkDeterministicLexerTokenizer( - src = "e.g., i.e, (e.g.,", - trg = "[e.g., ,, i.e, ,, (, e.g., ,]") - - checkDeterministicLexerTokenizer( - src = " \n \t", - trg = "[]") - - checkDeterministicLexerTokenizer( - src = "\"John & Mary's dog,\" Jane thought (to herself).\n" + "\"What a #$%!\n" + "a- ``I like AT&T''.\"", - trg = "[\", John, &, Mary, 's, dog, ,, \", Jane, thought, (, to, herself, ), ., \", What, a, #, $, %, !, a, -, ``, I, like, AT&T, '', ., \"]") - - checkDeterministicLexerTokenizer( - src = "I said at 4:45pm. Never 2am.", - trg = "[I, said, at, 4:45, pm, ., Never, 2, am, .]") - - checkDeterministicLexerTokenizer( - src = "I can't believe they wanna keep 40% of that.\"``Whatcha think?''\"I don't --- think so...,\"", - trg = "[I, ca, n't, believe, they, wan, na, keep, 40, %, of, that, ., \", ``, What, cha, think, ?, '', \", I, do, n't, ---, think, so, ..., ,, \"]") - - checkDeterministicLexerTokenizer( - src = "You `paid' US$170,000?!\nYou should've paid only$16.75.", - trg = "[You, `, paid, ', US$, 170,000, ?!, You, should, 've, paid, only, $, 16.75, .]") - - checkDeterministicLexerTokenizer( - src = " 1. Buy a new Chevrolet (37%-owned in the U.S..) . 15%", - trg = "[1, ., Buy, a, new, Chevrolet, (, 37, %, -, owned, in, the, U.S., ., ), ., 15, %]") - - checkDeterministicLexerTokenizer( - src = "A. A.A.A.I. and U.S. in U.S.. etc., but not A... 
or A..B iPhone 3G in Washington D.C.", - trg = "[A., A.A.A.I., and, U.S., in, U.S., ., etc., ,, but, not, A, ..., or, A, .., B, iPhone, 3, G, in, Washington, D.C.]" - ) - - checkDeterministicLexerTokenizer( - src = "AT&T but don't grab LATÉ and be sure not to grab PE&gym", - trg = "[AT&T, but, do, n't, grab, LATÉ, and, be, sure, not, to, grab, PE, &, gym]" - ) - - /* A useful string for checking various options to the tokenizer (but not checked here) */ - checkDeterministicLexerTokenizer( - src = "2012-04-05 ethno-centric art-o-torium sure. thing", - trg = "[2012-04-05, ethno-centric, art-o-torium, sure, ., thing]" - ) - - checkDeterministicLexerTokenizer( - src = "Half of investors expect Greece to leave the euro zone.", - trg = "[Half, of, investors, expect, Greece, to, leave, the, euro, zone, .]" - ) - } - - @Test def testDeterministicNormalizingTokenizer(): Unit = { - checkDeterministicNormalizingTokenizer(DeterministicNormalizingTokenizer, - src = "''Going to the storé to grab . . . some cöfféé for \u20ac50. He`s right\u2026 – & \u2154 \\*\\* -- ™ — \u2015 \u0096 -- --- ..", - trg = "[\", Going, to, the, store, to, grab, ..., some, coffee, for, $, 50, ., He, ', s, right, ..., -, &, 2/3, **, --, (TM), --, --, -, --, --, ...]" - ) - } - - @Test def testDeterministicNormalizingHtmlTokenizer(): Unit = { - checkDeterministicNormalizingTokenizer(DeterministicNormalizingHtmlTokenizer, - src = "''Going to the storé to grab . . . some cöfféé for \u20ac50. He`s right\u2026 – & \u2154 \\*\\* -- ™ — \u2015 \u0096 -- --- ..", - trg = "[\", Going, to, the, store, to, grab, ..., some, coffee, for, $, 50, ., He, ', s, right, ..., -, &, 2/3, **, --, (TM), --, --, -, --, --, ...]" - ) - - checkDeterministicNormalizingTokenizer(DeterministicNormalizingHtmlTokenizer, - src = "\n

\nblah blah\n\n

\n", - trg = "[,

, blah, blah,

, ]" - ) - } - - val testText2 = - """ - The problem with MacroTypeTag is that it can be used outside macros. - - A fact about FullTypeTag that I don't like is that it implies that - it's somehow more full-fledged than TypeTag. - - What about ArbTypeTag (from arbitrary)? I agree the name is cryptic, - but at least it's not misleading and it doesn't imply that this type - tag carries more information that a vanilla TypeTag. - """.stripMargin - - @Test def testRegexes(): Unit = { - - val reg = new PunktSentenceSegmenter.Punkt.PunktLanguageVars() - val allMatches = reg.wordTokenizerRegex.findAllIn(testText2).toSeq - allMatches.foreach(s => logger.debug(s.toString)) - - assert(allMatches == Seq("The", "problem", "with", "MacroTypeTag", "is", "that", "it", "can", "be", "used", - "outside", "macros.", "A", "fact", "about", "FullTypeTag", "that", "I", "don", "'t", "like", "is", "that", "it", "implies", - "that", "it", "'s", "somehow", "more", "full-fledged", "than", "TypeTag.", "What", "about", "ArbTypeTag", "(", "from", - "arbitrary", ")", "?", "I", "agree", "the", "name", "is", "cryptic", ",", "but", "at", "least", "it", "'s", "not", - "misleading", "and", "it", "doesn", "'t", "imply", "that", "this", "type", "tag", "carries", "more", "information", - "that", "a", "vanilla", "TypeTag.")) - - } - - @Test def allTests(): Unit = { - - val testText1 = - """ - Something *less* subtle differentiates them. "f" is a method in *both* - cases. In the first case, it's a method with one parameter list of - arity 1 taking an Int, and returning an Int. I'll try to use the word U.S.A. In the second case, f is - a nullary method returning an Int => Int. - - Now, Int => Int, to be clear about it, is the same thing as Function1[Int, Int]. - - Methods are not values, functions are. - """.stripMargin - -// val tokenized1 = getTokenizedSentences(testText1) -// printTokenizedSentences(tokenized1) - -// val tokenized2 = getTokenizedSentences(testText2) -// printTokenizedSentences(tokenized2) - - val jointTokenized = getTokenizedSentences(Seq(testText1, testText2)) - - jointTokenized.foreach(printTokenizedSentences(_)) - - assert(jointTokenized(0).length == 7, jointTokenized(0).length) - assert(jointTokenized(1).length == 4, jointTokenized(1).length) - -// val noInference = getTokenizedSentences(Seq(testText1, testText2), Non) -// noInference.foreach(printTokenizedSentences(_)) - - val text = """ - Punkt knows that the periods in Mr. Smith and Johann S. Bach - do not mark sentence boundaries. And sometimes sentences - can start with non-capitalized words. i is a good variable - name. - """ - - val sampleTokenized = getTokenizedSentences(Seq(text)) - assert(sampleTokenized(0).length == 3, sampleTokenized(0).length) - - val moreText = - """ - President F.W. de Klerk released the ANC men -- along with one of the founding members of the Pan Africanist Congress, a rival liberation group -- - as part of his efforts to create a climate of trust and peace in which his government can begin negotiations with black leaders over a new constitution - aimed at giving blacks a voice in national government. But Pretoria may instead be creating a climate for more turmoil and uncertainty in this - racially divided country. As other repressive governments, particularly Poland and the Soviet Union, have recently discovered, initial steps to open - up society can create a momentum for radical change that becomes difficult, if not impossible, to control. 
As the days go by, the South African - government will be ever more hard pressed to justify the continued imprisonment of Mr. Mandela as well as the continued banning of the ANC and - enforcement of the state of emergency. If it does n't yield on these matters, and eventually begin talking directly to the ANC, the expectations - and promise raised by yesterday 's releases will turn to disillusionment and unrest. If it does, the large number of right-wing whites, who - oppose any concessions to the black majority, will step up their agitation and threats to take matters into their own hands. The newly released ANC - leaders also will be under enormous pressure. The government is watching closely to see if their presence in the townships leads to increased anti-government - protests and violence; if it does, Pretoria will use this as a reason to keep Mr. Mandela behind bars. Pretoria has n't forgotten why they were all - sentenced to life imprisonment in the first place: for sabotage and conspiracy to overthrow the government. In addition, the government is figuring - that the releases could create a split between the internal and external wings of the ANC and between the newly freed leaders and those activists - who have emerged as leaders inside the country during their imprisonment. In order to head off any divisions, Mr. Mandela, in a meeting with - his colleagues before they were released, instructed them to report to the ANC headquarters in Lusaka as soon as possible. The men also will be faced - with bridging the generation gap between themselves and the country 's many militant black youths, the so-called young lions who are anxious to see - the old lions in action. Says Peter Mokaba, president of the South African Youth Congress: `` `` We will be expecting them to act like leaders - of the ANC. '' They never considered themselves to be anything else. At last night 's rally, they called on their followers to be firm, - yet disciplined, in their opposition to apartheid. `` `` We emphasize discipline because we know that the government is very, very sensitive, '' said - Andrew Mlangeni, another early Umkhonto leader who is now 63. `` `` We want to see Nelson Mandela and all our comrades out of prison, and if we are n't - disciplined we may not see them here with us. 
- """ - - val moreTokenized = getTokenizedSentences(Seq(moreText)) - printTokenizedSentences(moreTokenized(0)) - assert(moreTokenized(0).length == 17, moreTokenized(0).length) - } - - def printTokenizedSentences(sentences: Seq[Sentence]): Unit = sentences.foreach(sen => logger.debug(sen.tokens.map(t => t.string))) - - def getTokenizedSentences(text: Seq[String], inference: SentenceBoundaryInference = JointlyAcrossDocuments): Seq[Seq[Sentence]] = { - val docs = text.map(t => new Document(t)) - new PunktTokenizer { override def sentenceBoundaryInference = inference }.process(docs) - docs.map(_.sentences.toSeq) - } - - /* We just want this to not cause a stack overflow exception */ - @Test def testDeterministicLexerTokenizerUnclosedLine(): Unit = { - val unclosedLine = "<" + ("x" * 1679) - val tokens = runTokenizer(DeterministicTokenizer, unclosedLine) - } - - /* We just want this to not cause a stack overflow exception */ - @Test def testDeterministicLexerTokenizerLongLine(): Unit = { - val hugeLine = """^M""" - val tokens = runTokenizer(DeterministicTokenizer, hugeLine) - } - - @Test def testRegexTokenizer(): Unit = { - assert(DeterministicTokenizer("Washington D.C.").toSeq == Seq("Washington", "D.C.")) - assert(DeterministicTokenizer("Acme Inc.").toSeq == Seq("Acme", "Inc.")) - assert(DeterministicTokenizer("Oct. 24").toSeq == Seq("Oct.", "24")) - assert(DeterministicTokenizer("Mr. Smith.").toSeq == Seq("Mr.", "Smith", ".")) - assert(DeterministicTokenizer("MR. SMITH.").toSeq == Seq("MR.", "SMITH", ".")) - assert(DeterministicTokenizer("mr. smith.").toSeq == Seq("mr.", "smith", ".")) // TODO Should this work? -akm - } - -} diff --git a/src/test/scala/cc/factorie/app/nlp/segment/TestPhraseTokenizer.scala b/src/test/scala/cc/factorie/app/nlp/segment/TestPhraseTokenizer.scala deleted file mode 100644 index e7aed06..0000000 --- a/src/test/scala/cc/factorie/app/nlp/segment/TestPhraseTokenizer.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.app.nlp.segment - -import org.junit.Assert._ -import org.junit.Test - -/** - * User: apassos - * Date: 8/19/13 - * Time: 1:22 PM - */ -class TestPhraseTokenizer { - @Test def testPhrases() { - val phrases = Seq(Seq("of", "cards"), Seq("New", "York", "City"), Seq("New", "York")) - val phraseTokenizer = new PhraseTokenizer(phrases) - val sampleDocument = new cc.factorie.app.nlp.Document("I built myself a house of cards in New York City, New York State.") - DeterministicNormalizingTokenizer.process(sampleDocument) - phraseTokenizer.process(sampleDocument) - val oldLength = sampleDocument.sections.length - assert(oldLength > 0) - val result = sampleDocument.attr[PhraseSectionList] - assertEquals(oldLength, result.length) - val tokens = result.head.tokens.map(_.string) - val expected = Seq("I", "built", "myself", "a", "house", "of cards", "in", "New York City", ",", "New York", "State", ".") - // println(tokens) - assertEquals(expected.length, tokens.length) - for ((e, t) <- expected.zip(tokens)) { - assertEquals(e, t) - } - } -} diff --git a/src/test/scala/cc/factorie/app/nlp/segment/TestRegexTokenizer.scala b/src/test/scala/cc/factorie/app/nlp/segment/TestRegexTokenizer.scala deleted file mode 100644 index 06b837f..0000000 --- a/src/test/scala/cc/factorie/app/nlp/segment/TestRegexTokenizer.scala +++ /dev/null @@ -1,314 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -package cc.factorie.app.nlp.segment - -import cc.factorie.app.nlp.{Document, DocumentAnnotatorPipeline, Sentence, Token} -import cc.factorie.util.FastLogging -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -class TestRegexTokenizer extends JUnitSuite with FastLogging { - - def assertEquals(a: Any, b: Any): Unit = assert(a == b, "\"%s\" did not equal \"%s\"" format (a, b)) - - @Test def testSentenceSegmenter(): Unit = { - val text = - """ - I got bored of the iPhone. This has been a conundrum for for the past few months. I wanted a new phone, but didn't know how to transcend away from the iOS environment. I woke up one morning, and said, "whatever, I don't care anymore," and walked into BestBuy(save the hate, I had a $25 coupon and $35 gift card) and bought the Note 2. I have been wanting this phone since I first hear about it. - - Why I made this decision. I love tech stuff, and the iphone, to me, was getting boring. My early upgrade just came through, so I could test a new phone, and still be within the launch period of the next gen iPhone. Having gone from the iPhone 3G, to Iphone4, to the 4S, you would think I would be drawn to the iPhone5 right? No. It did nothing for me. I don't even know how to explain it. - - These are some things that worried me about switching. I have a Mac, and though I use Windows7 on BC, I still want my phone to sync natively to my Mac. 
The worry that syncing wouldn't be as smooth as the iPhone had me. I don't think there's another phone in existence that syncs info as well as the iPhone on an Mac. I had gotten used to easily syncing EVERYTHING in one go with Itunes and iPhoto, but I decided to just go with it and use it as an experience. - - Now that I actually own the Note 2, and more specifically an Android phone, I actually have a better understanding of the OS quality provided by Apple. - However, with that said, the Android 4.1 is awesome. Better than anything to come before it from Android(obviously, right?). This phone is an absolute MONSTER! - - I now use my iphone as an alarm clock and is the bluetooth source to play music in my car. - """.stripMargin - val d = new Document((1 to 2).map(_ => text).mkString("\n")) - DocumentAnnotatorPipeline(DeterministicRegexTokenizer, DeterministicSentenceSegmenter).process(d) - assert(d.sentences.size > 0) - d.sentences.map(_.string).foreach(s => logger.debug(s.toString)) - } - - @Test def testSentenceSegmenterWithOneSentence() { - val text = "The quick brown fox jumps over the lazy dog." - val d = new Document(text) - DocumentAnnotatorPipeline(DeterministicRegexTokenizer, DeterministicSentenceSegmenter).process(d) - assert(d.sentences.size == 1) - assert(d.tokens.size == 10) - } - - private def runDeterministicRegexTokenizer(src: String): Iterable[Token] = { - val d = new Document(src) - DocumentAnnotatorPipeline(DeterministicRegexTokenizer).process(d).tokens - } - - @Test def testDeterministicRegexTokenizer(): Unit = { - def check(src: String, trg: String): Unit = { - val tokens = runDeterministicRegexTokenizer(src) - for (t <- tokens) { - assertEquals(t.string, src.substring(t.stringStart, t.stringEnd)) - } - assertEquals("[" + tokens.map(_.string).mkString(", ") + "]", trg) - } - - // spaces - check( - src = "a b c\n d \t\n\r\fe", - trg = "[a, b, c, d, e]") - - // emoticons - check( - src = ":-)))) :----) :( :-) :--)", - trg = "[:-)))), :----), :(, :-), :--)]") - - // URLs - check( - src = "|http://www.google.com|www.google.com|mailto:somebody@google.com|some-body@google+.com|", - trg = "[|, http://www.google.com, |, www.google.com, |, mailto:somebody@google.com, |, some-body@google+.com, |]") - - check( - src = "google.com index.html a.b.htm ab-cd.shtml", - trg = "[google.com, index.html, a.b.htm, ab-cd.shtml]") - - // abbreviations - check( - src = "prof. ph.d. a. a.b. a.b a.b.c. men.cd ab.cd", - trg = "[prof., ph.d., a., a.b., a.b, a.b.c., men, ., cd, ab, ., cd]") - - // consecutive punctuation - check( - src = "A..B!!C??D.!?E.!?.!?F..!!??", - trg = "[A, .., B, !!, C, ??, D, .!?, E, .!?.!?, F, ..!!??]") - - check( - src = ",,A---C*D**E~~~~F==", - trg = "[,,, A, ---, C, *, D, **, E, ~~~~, F, ==]") - // was: trg = "[,,, A, ---, C*D, **, E, ~~~~, F, ==]") - - // dots in numbers - // Really? Do we want this? 
-akm -// check( -// src = ".1 a.1 2.3 4,5 6:7 8-9 0/1 '2 3's 3'4 5'b a'6 a'b", -// trg = "[.1, a.1, 2.3, 4,5, 6:7, 8-9, 0/1, '2, 3's, 3'4, 5'b, a'6, a'b]") - -// check( -// src = ".a a.3 4,a a:a a8-9 0/1a", -// trg = "[., a, a.3, 4, ,, a, a, :, a, a8-9, 0/1a]") - - // hyphens - check( - src = "dis-able cross-validation o-kay art-o-torium s-e-e art-work", - trg = "[dis-able, cross-validation, o-kay, art-o-torium, s-e-e, art, -, work]") - - // apostrophies - check( - src = "he's we'd I'm you'll they're I've didn't did'nt", - trg = "[he, 's, we, 'd, I, 'm, you, 'll, they, 're, I, 've, did, n't, did, 'nt]") - - check( - src = "he'S DON'T gue'ss", - trg = "[he, 'S, DO, N'T, gue, ', ss]") - // Was: trg = "[he, 'S, DO, N'T, gue'ss]") - - // Really? Do we want this? -akm -// check( -// src = "aint cannot don'cha d'ye i'mma dunno", -// trg = "[ai, nt, can, not, do, n', cha, d', ye, i, 'm, ma, du, n, no]") - - check( - src = "$1 E2 L3 USD1 2KPW ||$1 USD1..", - trg = "[$, 1, E2, L3, USD, 1, 2, KPW, |, |, $, 1, USD, 1, ..]") - - check( - src = "1m 2mm 3kg 4oz", - trg = "[1, m, 2, mm, 3, kg, 4, oz]") - - check( - src = "1D 2nM 3CM 4LB", - trg = "[1, D, 2, nM, 3, CM, 4, LB]") - - check( - src = "(1){2}[3]<4>", - trg = "[(, 1, ), {, 2, }, [, 3, ], <, 4, >]") - - // Really? Do we want this? -akm -// check( -// src = "`a'b,c:d;e-f/g\"h'", -// trg = "[`, a'b, ,, c, :, d, ;, e, -, f, /, g, \", h, ']") -// check( -// src = "`a'b,c:d;e-f/g\"h'", -// trg = "[`, a'b, ,, c, :, d, ;, e, -, f, /, g, \", h, ']") - - check( - src = "a@b #c$d%e&f|g", - trg = "[a@b, #, c$, d, %, e, &, f, |, g]") - // Was: trg = "[a@b, #, c, $, d, %, e, &, f, |, g]") - - check( - src = "e.g., i.e, (e.g.,", - trg = "[e.g., ,, i.e, ,, (, e.g., ,]") - - check( - src = " \n \t", - trg = "[]") - - check( - src = "\"John & Mary's dog,\" Jane thought (to herself).\n" + "\"What a #$%!\n" + "a- ``I like AT&T''.\"", - trg = "[\", John, &, Mary, 's, dog, ,, \", Jane, thought, (, to, herself, ), ., \", What, a, #, $, %, !, a, -, ``, I, like, AT&T, '', ., \"]") - - check( - src = "I said at 4:45pm. Never 2am.", - trg = "[I, said, at, 4:45, pm, ., Never, 2, am, .]") - - check( - src = "I can't believe they wanna keep 40% of that.\"``Whatcha think?''\"I don't --- think so...,\"", - trg = "[I, ca, n't, believe, they, wan, na, keep, 40, %, of, that, ., \", ``, What, cha, think, ?, '', \", I, do, n't, ---, think, so, ..., ,, \"]") - - check( - src = "You `paid' US$170,000?!\nYou should've paid only$16.75.", - trg = "[You, `, paid, ', US$, 170,000, ?!, You, should, 've, paid, only, $, 16.75, .]") - - check( - src = " 1. Buy a new Chevrolet (37%-owned in the U.S..) . 15%", - trg = "[1, ., Buy, a, new, Chevrolet, (, 37, %, -, owned, in, the, U.S., ., ), ., 15, %]") - } - - val testText2 = - """ - The problem with MacroTypeTag is that it can be used outside macros. - - A fact about FullTypeTag that I don't like is that it implies that - it's somehow more full-fledged than TypeTag. - - What about ArbTypeTag (from arbitrary)? I agree the name is cryptic, - but at least it's not misleading and it doesn't imply that this type - tag carries more information that a vanilla TypeTag. 
- """.stripMargin - - @Test def testRegexes(): Unit = { - - val reg = new PunktSentenceSegmenter.Punkt.PunktLanguageVars() - val allMatches = reg.wordTokenizerRegex.findAllIn(testText2).toSeq - allMatches.foreach(s => logger.debug(s.toString)) - - assert(allMatches == Seq("The", "problem", "with", "MacroTypeTag", "is", "that", "it", "can", "be", "used", - "outside", "macros.", "A", "fact", "about", "FullTypeTag", "that", "I", "don", "'t", "like", "is", "that", "it", "implies", - "that", "it", "'s", "somehow", "more", "full-fledged", "than", "TypeTag.", "What", "about", "ArbTypeTag", "(", "from", - "arbitrary", ")", "?", "I", "agree", "the", "name", "is", "cryptic", ",", "but", "at", "least", "it", "'s", "not", - "misleading", "and", "it", "doesn", "'t", "imply", "that", "this", "type", "tag", "carries", "more", "information", - "that", "a", "vanilla", "TypeTag.")) - - } - - @Test def allTests(): Unit = { - - val testText1 = - """ - Something *less* subtle differentiates them. "f" is a method in *both* - cases. In the first case, it's a method with one parameter list of - arity 1 taking an Int, and returning an Int. I'll try to use the word U.S.A. In the second case, f is - a nullary method returning an Int => Int. - - Now, Int => Int, to be clear about it, is the same thing as Function1[Int, Int]. - - Methods are not values, functions are. - """.stripMargin - -// val tokenized1 = getTokenizedSentences(testText1) -// printTokenizedSentences(tokenized1) - -// val tokenized2 = getTokenizedSentences(testText2) -// printTokenizedSentences(tokenized2) - - val jointTokenized = getTokenizedSentences(Seq(testText1, testText2)) - - jointTokenized.foreach(printTokenizedSentences(_)) - - assert(jointTokenized(0).length == 7, jointTokenized(0).length) - assert(jointTokenized(1).length == 4, jointTokenized(1).length) - -// val noInference = getTokenizedSentences(Seq(testText1, testText2), Non) -// noInference.foreach(printTokenizedSentences(_)) - - val text = """ - Punkt knows that the periods in Mr. Smith and Johann S. Bach - do not mark sentence boundaries. And sometimes sentences - can start with non-capitalized words. i is a good variable - name. - """ - - val sampleTokenized = getTokenizedSentences(Seq(text)) - assert(sampleTokenized(0).length == 3, sampleTokenized(0).length) - - val moreText = - """ - President F.W. de Klerk released the ANC men -- along with one of the founding members of the Pan Africanist Congress, a rival liberation group -- - as part of his efforts to create a climate of trust and peace in which his government can begin negotiations with black leaders over a new constitution - aimed at giving blacks a voice in national government. But Pretoria may instead be creating a climate for more turmoil and uncertainty in this - racially divided country. As other repressive governments, particularly Poland and the Soviet Union, have recently discovered, initial steps to open - up society can create a momentum for radical change that becomes difficult, if not impossible, to control. As the days go by, the South African - government will be ever more hard pressed to justify the continued imprisonment of Mr. Mandela as well as the continued banning of the ANC and - enforcement of the state of emergency. If it does n't yield on these matters, and eventually begin talking directly to the ANC, the expectations - and promise raised by yesterday 's releases will turn to disillusionment and unrest. 
If it does, the large number of right-wing whites, who - oppose any concessions to the black majority, will step up their agitation and threats to take matters into their own hands. The newly released ANC - leaders also will be under enormous pressure. The government is watching closely to see if their presence in the townships leads to increased anti-government - protests and violence; if it does, Pretoria will use this as a reason to keep Mr. Mandela behind bars. Pretoria has n't forgotten why they were all - sentenced to life imprisonment in the first place: for sabotage and conspiracy to overthrow the government. In addition, the government is figuring - that the releases could create a split between the internal and external wings of the ANC and between the newly freed leaders and those activists - who have emerged as leaders inside the country during their imprisonment. In order to head off any divisions, Mr. Mandela, in a meeting with - his colleagues before they were released, instructed them to report to the ANC headquarters in Lusaka as soon as possible. The men also will be faced - with bridging the generation gap between themselves and the country 's many militant black youths, the so-called young lions who are anxious to see - the old lions in action. Says Peter Mokaba, president of the South African Youth Congress: `` `` We will be expecting them to act like leaders - of the ANC. '' They never considered themselves to be anything else. At last night 's rally, they called on their followers to be firm, - yet disciplined, in their opposition to apartheid. `` `` We emphasize discipline because we know that the government is very, very sensitive, '' said - Andrew Mlangeni, another early Umkhonto leader who is now 63. `` `` We want to see Nelson Mandela and all our comrades out of prison, and if we are n't - disciplined we may not see them here with us. - """ - - val moreTokenized = getTokenizedSentences(Seq(moreText)) - printTokenizedSentences(moreTokenized(0)) - assert(moreTokenized(0).length == 17, moreTokenized(0).length) - } - - def printTokenizedSentences(sentences: Seq[Sentence]): Unit = sentences.foreach(sen => logger.debug(sen.tokens.map(t => t.string))) - - def getTokenizedSentences(text: Seq[String], inference: SentenceBoundaryInference = JointlyAcrossDocuments): Seq[Seq[Sentence]] = { - val docs = text.map(t => new Document(t)) - new PunktTokenizer { override def sentenceBoundaryInference = inference }.process(docs) - docs.map(_.sentences.toSeq) - } - - @Test def testDeterministicRegexTokenizerUnclosedLine(): Unit = { - val unclosedLine = "<" + ("x" * 1679) - val tokens = runDeterministicRegexTokenizer(unclosedLine) - } - - @Test def testDeterministicRegexTokenizerLongLine(): Unit = { - val hugeLine = """^M""" - val tokens = runDeterministicRegexTokenizer(hugeLine) - } - - @Test def testRegexTokenizer(): Unit = { - assert(DeterministicRegexTokenizer("Washington D.C.").toSeq == Seq("Washington", "D.C.")) - assert(DeterministicRegexTokenizer("Acme Inc.").toSeq == Seq("Acme", "Inc.")) - assert(DeterministicRegexTokenizer("Oct. 24").toSeq == Seq("Oct.", "24")) - assert(DeterministicRegexTokenizer("Mr. Smith.").toSeq == Seq("Mr.", "Smith", ".")) - //println(RegexTokenizer("MR. SMITH.").mkString(" ")) - //assert(RegexTokenizer("MR. SMITH.").toSeq == Seq("MR.", "SMITH", ".")) // TODO It would be nice if this worked. - //assert(RegexTokenizer("mr. smith.").toSeq != Seq("mr.", "smith", ".")) // TODO Should this work? 
-akm - } - -} diff --git a/src/test/scala/cc/factorie/app/regress/TestRegression.scala b/src/test/scala/cc/factorie/app/regress/TestRegression.scala deleted file mode 100644 index 8ff345a..0000000 --- a/src/test/scala/cc/factorie/app/regress/TestRegression.scala +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.regress - -import cc.factorie.la._ -import cc.factorie.optimize.OptimizableObjectives -import cc.factorie.variable.{DiffList, TensorVariable} -import org.junit.Assert._ -import org.junit.Test - -/** - * @author apassos - * @since 9/15/12 - */ - -class MyTensorVariable(x0: Double, x1: Double, y: Double)(implicit d: DiffList = null) extends TensorVariable[Tensor1] { - set(new DenseTensor1(1)) - value(0) = y - - val inner = new TensorVariable[Tensor1] - inner.set(new DenseTensor1(2)) - inner(0) = x0 - inner(1) = x1 - - def getFeatures = inner -} - -class TestRegression { - @Test def testSimpleRegression() { - // y = 2*x0 + x1 - val y0 = new MyTensorVariable(1, 2, 4) - val y1 = new MyTensorVariable(2, 1, 5) - val y2 = new MyTensorVariable(1, 1, 3) - - val regressor = LinearRegressionTrainer.train[TensorVariable[Tensor1], MyTensorVariable](Seq(y0, y1, y2), f => f.getFeatures, 0.0) - assertEquals(4, regressor.regress(y0).dependantValue(0), 0.01) - assertEquals(5, regressor.regress(y1).dependantValue(0), 0.01) - assertEquals(3, regressor.regress(y2).dependantValue(0), 0.01) - - val regressor2 = LinearRegressionTrainer.train[TensorVariable[Tensor1], MyTensorVariable](Seq(y0, y1, y2), f => f.getFeatures, 0.0, OptimizableObjectives.epsilonInsensitiveSqMultivariate(0.001)) - assertEquals(4, regressor2.regress(y0).dependantValue(0), 0.01) - assertEquals(5, regressor2.regress(y1).dependantValue(0), 0.01) - assertEquals(3, regressor2.regress(y2).dependantValue(0), 0.01) - } -} - -class TestLinearRegressor { - - class MyTensorVariable(x0: Double, x1: Double, y: Double)(implicit d: DiffList = null) extends TensorVariable[Tensor1] { - // the target value - set(new DenseTensor1(Array(y))) - // the dependent values - val features = new TensorVariable[Tensor1](new DenseTensor1(Array(x0, x1))) - } - - @Test - def testLinearRegressor { - // y = 2*x_0 + x_1 - val weights = new DenseTensor2(Array(Array(2.0), Array(1.0))) - val r = new LinearRegressor[TensorVariable[Tensor1], MyTensorVariable](v=>v.features, weights) - - // y = 2*3 + 1*4 = 10 - val result = r.regress(new MyTensorVariable(3, 4, 0)) - assertEquals(10, result.dependantValue(0), 0.01) - - // perform multiple regressions - val results = r.regress(Seq(new MyTensorVariable(3,4,0), new MyTensorVariable(5,6,0))) - assertEquals(10, results(0).dependantValue(0), 0.01) - assertEquals(16, results(1).dependantValue(0), 0.01) - - } -} diff --git a/src/test/scala/cc/factorie/app/uschema/TestCoocMatrix.scala 
b/src/test/scala/cc/factorie/app/uschema/TestCoocMatrix.scala deleted file mode 100644 index 5f6c7a9..0000000 --- a/src/test/scala/cc/factorie/app/uschema/TestCoocMatrix.scala +++ /dev/null @@ -1,266 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.uschema - -import cc.factorie.util -import org.junit.Assert._ -import org.scalatest.junit.JUnitSuite -import org.junit.Test -import com.github.fakemongo.Fongo -import com.mongodb.{DBCollection, BasicDBObject, DB} -import cc.factorie.la.SparseIndexedTensor2 -import scala.util.Random - - -/** - * Created by beroth on 1/30/15. - */ -class TestCoocMatrix extends JUnitSuite with util.FastLogging { - - val eps = 1e-4 - - @Test def getSetCellsTest() { - val m = new CoocMatrix(0,0) - m.set(0,0,1.0) - m.set(4,2,3.0) - m.set(1,3,1.0) - m.set(4,2,2.0) - assertEquals(m.numRows(),5) - assertEquals(m.numCols(),4) - assertEquals(m.get(1, 3),1.0, eps) - assertEquals(m.get(4, 2),2.0, eps) - assertEquals(m.get(2, 2),0.0, eps) - assertEquals(m.get(5, 5),0.0, eps) - assertEquals(m.nnz(), 3) - m.set(1,3,0) - assertEquals(m.nnz(), 2) - m.set(1,3,0) - assertEquals(m.nnz(), 2) - m.set(5,4,0) - assertEquals(m.nnz(), 2) - assertEquals(m.numRows(),6) - assertEquals(m.numCols(),5) - } - - - @Test def copyTest() { - val m = new CoocMatrix(0,0) - m.set(0,0,1.0) - m.set(1,3,1.0) - m.set(4,2,2.0) - val m2 = m.copy() - assertTrue(m.hasSameContent(m2)) - m2.set(1,3,0) - assertFalse(m.hasSameContent(m2)) - assertFalse(m.getNnzCells().toSet == m2.getNnzCells().toSet) - } - - @Test def pruneMatrixTest() { - val m = new CoocMatrix(0,0) - m.set(1,1,1.0) - m.set(2,2,1.0) - m.set(2,3,1.0) - m.set(3,3,1.0) - - val (m0, rowMap0, colMap0) = m.prune(0,0) - // pruned matrix only contains biggest component, i.e. 
rows 2 and 3, and columns 2 and 3 - assertEquals(m0.numRows(), 2) - assertEquals(m0.numCols(), 2) - - assertFalse(rowMap0.contains(0)) - assertFalse(colMap0.contains(0)) - assertFalse(rowMap0.contains(1)) - assertFalse(colMap0.contains(1)) - assertTrue(rowMap0.contains(2)) - assertTrue(colMap0.contains(2)) - assertTrue(rowMap0.contains(3)) - assertTrue(colMap0.contains(3)) - - // check that the columns are mapped with the order preserved - assertEquals(colMap0(2), 0) - assertEquals(colMap0(3), 1) - assertEquals(rowMap0(2), 0) - assertEquals(rowMap0(3), 1) - - val (m1, rowMap1, colMap1) = m.prune(0,1) - assertEquals(2, m1.numRows()) - assertEquals(1, m1.numCols()) - assertFalse(colMap1.contains(0)) - assertFalse(colMap1.contains(1)) - assertFalse(colMap1.contains(2)) - assertEquals(0, colMap1(3)) - assertFalse(rowMap1.contains(0)) - assertFalse(rowMap1.contains(1)) - assertEquals(0, rowMap1(2)) - assertEquals(1, rowMap1(3)) - } - - @Test def equalsTest() { - val m1 = new CoocMatrix(0,0) - m1.set(0,0,1.0) - m1.set(0,1,1.0) - m1.set(0,2,1.0) - m1.set(0,3,1.0) - m1.set(4,2,3.0) - m1.set(1,3,1.0) - m1.set(4,2,2.0) - - val m2 = new CoocMatrix(0,0) - m2.set(4,2,2.0) - m2.set(1,3,1.0) - m2.set(0,3,1.0) - m2.set(0,2,1.0) - m2.set(0,1,1.0) - m2.set(0,0,1.0) - - val m3 = new CoocMatrix(0,0) - m3.set(4,2,2.0) - m3.set(1,3,1.0) - m3.set(0,0,1.0) - - assertTrue(m1.hasSameContent(m2)) - assertTrue(m2.hasSameContent(m1)) - assertTrue(m3.hasSameContent(m3)) - assertFalse(m1.hasSameContent(m3)) - assertFalse(m3.hasSameContent(m1)) - } - - @Test def writeReadMongoTest() { - // Fake in-memory mongo server. - val fongo = new Fongo("myserver"); - val db : DB = fongo.getDB("mydb"); - - val m1 = new CoocMatrix(0,0) - m1.set(0,0,1.0) - m1.set(0,1,1.0) - m1.set(0,2,1.0) - m1.set(0,3,1.0) - m1.set(4,2,3.0) - m1.set(1,3,1.0) - m1.set(4,2,2.0) - - m1.writeToMongo(db) - - val m2 = new CoocMatrix(0,0) - m2.populateFromMongo(db) - assertTrue(m1.hasSameContent(m2)) - } - - /* - @Test def writeReadMongoCellBasedTest() { - // Fake in-memory mongo server. - val fongo = new Fongo("myserver"); - val db : DB = fongo.getDB("mydb"); - - val m1 = new CoocMatrix(0,0) - m1.set(0,0,1.0) - m1.set(0,1,1.0) - m1.set(0,2,1.0) - m1.set(0,3,1.0) - m1.set(4,2,3.0) - m1.set(1,3,1.0) - m1.set(4,2,2.0) - - m1.writeToMongoCellBased(db) - - val m2 = CoocMatrix.fromMongoCellBased(db) - assertTrue(m1.hasSameContent(m2)) - } - */ - - @Test def testSplitTest() { - //0101 - //1101 - //0010 - //1101 - val m = new CoocMatrix(0,0) - m.set(0,1,1.0) - m.set(0,3,1.0) - m.set(1,0,1.0) - m.set(1,1,1.0) - m.set(1,3,1.0) - m.set(2,2,1.0) - m.set(3,0,1.0) - m.set(3,1,1.0) - m.set(3,3,1.0) - // Just use rows and cols 1,2,3 for testing purposes - val testRows = Set(1,2,3) - val testCols = Set(1,2,3) - - // Make sure that test passes for different random initialiaztions - for (seed <- 0 until 10) { - val random = new Random(seed) - val (mtrain, mdev, mtest) = m.randomTestSplit(2,3,Some(testRows), Some(testCols), random) - // Cell 2,2 is not elegible, so there are only 2 cells left for test set - assertFalse(mtest.getNnzCells().toSet.contains((2,2))) - assertFalse(mdev.getNnzCells().toSet.contains((2,2))) - assertEquals(2,mdev.nnz()) - assertEquals(2,mtest.nnz()) - assertEquals(5,mtrain.nnz()) - // the 3 matrices are a partitoning of m: - // 1. their size is 2+2+5 = 9 - // 2. 
they contain all elements - assertEquals(m.getNnzCells().toSet, mtrain.getNnzCells().toSet ++ mtest.getNnzCells().toSet ++ mdev.getNnzCells().toSet) - } - } - - @Test def testSplitRandomizedTest() { - val numRows = 1000 - val numCols = 100 - val nnz = 10000 - val numDevNNZ = 100 - val numTestNNZ = 150 - - val numTopics = 1 - val noise1 = 0.1 - for (seed <- 0 until 10) { - val random = new Random(seed) - val m = CoocMatrix.randomOneZeroMatrix(numRows, numCols, nnz, random, numTopics, noise1) - val (mTrain,mDev,mTest) = m.randomTestSplit(numDevNNZ, numTestNNZ, None, Some(Set(0,1,2,3,4,5,6,7,8,9)), random) - assertEquals(numDevNNZ, mDev.nnz()) - assertEquals(numTestNNZ, mTest.nnz()) - assertEquals(m.nnz(), mTrain.nnz() + mDev.nnz() + mTest.nnz()) - } - } - - @Test def randomMatrixTest() { - val numRows = 1000 - val numCols = 1000 - val nnz = 10000 - val numTopics = 10 - val noise1 = 0.1 - for (seed <- 0 until 10) { - val random = new Random(seed) - val m = CoocMatrix.randomOneZeroMatrix(numRows, numCols, nnz, random, numTopics, noise1) - // non-zeros roughly as specified - assertTrue(m.nnz() <= nnz) - assertTrue(m.nnz() > 0.9 * nnz) - val noiseCells = m.getNnzCells().filter(cell => (cell._1 % numTopics != cell._2 % numTopics)) - // Ratio of noise roughly as specified - assertEquals(noiseCells.size / m.nnz().toDouble, noise1, 0.05) - } - } - - @Test def readFromTensor2Test() { - val t2 = new SparseIndexedTensor2(10, 10) - t2.+=(0, 2, 3.0) - t2.+=(0,0, 5.0) - t2.+=(3, 0, 7.0) - t2.+=(5, 9, 10.0) - val m = CoocMatrix.fromTensor2(t2) - assert(m.get(0,2) == 3.0) - assert(m.get(0,0) == 5.0) - assert(m.get(3,0) == 7.0) - assert(m.get(5,9) == 10.0) - } -} diff --git a/src/test/scala/cc/factorie/app/uschema/TestEntityRelationKBMatrix.scala b/src/test/scala/cc/factorie/app/uschema/TestEntityRelationKBMatrix.scala deleted file mode 100644 index d99d0d8..0000000 --- a/src/test/scala/cc/factorie/app/uschema/TestEntityRelationKBMatrix.scala +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.uschema - -import org.scalatest.junit.JUnitSuite -import cc.factorie.util -import org.junit.Test -import org.junit.Assert._ -import com.github.fakemongo.Fongo -import com.mongodb.DB -import scala.util.Random - -/** - * Created by beroth on 2/6/15. 
- */ -class TestEntityRelationKBMatrix extends JUnitSuite with util.FastLogging { - - val eps = 1e-4 - - @Test def getSetCellsTest() { - val m = new EntityRelationKBMatrix() - m.set(EntityPair("Barack Obama", "Michelle Obama"), "is married to", 5.0) - m.set(EntityPair("Barack Obama", "Michelle Obama"), "is married to", 10.0) - m.set(EntityPair("Barack Obama", "Michelle Obama"), "per:spouse", 1.0) - m.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "per:spouse", 1.0) - m.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife", 2.0) - assertEquals(m.numRows(),2) - assertEquals(m.numCols(),3) - assertEquals(10.0, m.get(EntityPair("Barack Obama", "Michelle Obama"), "is married to"), eps) - assertEquals(1.0, m.get(EntityPair("Barack Obama", "Michelle Obama"), "per:spouse"), eps) - assertEquals(1.0, m.get(EntityPair("Frank Sinatra", "Nancy Barbato"), "per:spouse"), eps) - assertEquals(2.0, m.get(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife"), eps) - assertEquals(0.0, m.get(EntityPair("Nicola Sarcozy", "Carla Bruni"), "per:spouse"), eps) - assertEquals(0.0, m.get(EntityPair("Barack Obama", "Michelle Obama"), "and his wife"), eps) - } - - @Test def equalsTest() { - val m1 = new EntityRelationKBMatrix() - m1.set(EntityPair("Barack Obama", "Michelle Obama"), "is married to", 5.0) - m1.set(EntityPair("Barack Obama", "Michelle Obama"), "is married to", 10.0) - m1.set(EntityPair("Barack Obama", "Michelle Obama"), "per:spouse", 1.0) - m1.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "per:spouse", 1.0) - m1.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife", 2.0) - - // same as m1, but constructed in different order - val m2 = new EntityRelationKBMatrix() - m2.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife", 2.0) - m2.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "per:spouse", 1.0) - m2.set(EntityPair("Barack Obama", "Michelle Obama"), "per:spouse", 1.0) - m2.set(EntityPair("Barack Obama", "Michelle Obama"), "is married to", 10.0) - - // similar to m2, but one different cell value - val m3 = new EntityRelationKBMatrix() - m3.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife", 1.0) - m3.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "per:spouse", 1.0) - m3.set(EntityPair("Barack Obama", "Michelle Obama"), "per:spouse", 1.0) - m3.set(EntityPair("Barack Obama", "Michelle Obama"), "is married to", 10.0) - - // different rows/columns - val m4 = new EntityRelationKBMatrix() - m4.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife", 2.0) - m4.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "per:spouse", 1.0) - m4.set(EntityPair("Nicola Sarcozy", "Carla Bruni"), "per:spouse", 1.0) - m4.set(EntityPair("Barack Obama", "Michelle Obama"), "and his wife", 1.0) - - assertTrue(m1.hasSameContent(m2)) - assertTrue(m2.hasSameContent(m1)) - assertFalse(m2.hasSameContent(m3)) - assertFalse(m3.hasSameContent(m4)) - } - - @Test def writeReadMongoTest() { - // Fake in-memory mongo server. 
- val fongo = new Fongo("myserver"); - val db : DB = fongo.getDB("mydb"); - - val m1 = new EntityRelationKBMatrix() - m1.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife", 2.0) - m1.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "per:spouse", 1.0) - m1.set(EntityPair("Barack Obama", "Michelle Obama"), "per:spouse", 1.0) - m1.set(EntityPair("Barack Obama", "Michelle Obama"), "is married to", 10.0) - - m1.writeToMongo(db) - - val m2 = new EntityRelationKBMatrix - m2.populateFromMongo(db) - assertTrue(m1.hasSameContent(m2)) - } - - @Test def pruneMatrixTest() { - val m = new EntityRelationKBMatrix() - m.set(EntityPair("Barack Obama", "Michelle Obama"), "is married to", 1.0) - m.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "per:spouse", 1.0) - m.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife", 1.0) - m.set(EntityPair("Nicola Sarcozy", "Carla Bruni"), "and his wife", 1.0) - - val m0 = m.prune(0,0) - - val m0goal = new EntityRelationKBMatrix() - m0goal.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "per:spouse", 1.0) - m0goal.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife", 1.0) - m0goal.set(EntityPair("Nicola Sarcozy", "Carla Bruni"), "and his wife", 1.0) - - assertTrue(m0.hasSameContent(m0goal)) - - val m1 = m.prune(0,1) - val m1goal = new EntityRelationKBMatrix() - m1goal.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife", 1.0) - m1goal.set(EntityPair("Nicola Sarcozy", "Carla Bruni"), "and his wife", 1.0) - - assertTrue(m1.hasSameContent(m1goal)) - - val m2 = m.prune(1,0) - val m2goal = new EntityRelationKBMatrix() - m2goal.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "per:spouse", 1.0) - m2goal.set(EntityPair("Frank Sinatra", "Nancy Barbato"), "and his wife", 1.0) - assertTrue(m2.hasSameContent(m2goal)) - } - - @Test def testSplitTest() { - //0101 - //1101 - //0010 - //1101 - val m = new EntityRelationKBMatrix() - m.set(EntityPair("A", "A"), "1",1.0) - m.set(EntityPair("A", "A"), "3",1.0) - - m.set(EntityPair("B", "B"), "0",1.0) - m.set(EntityPair("B", "B"), "1",1.0) - m.set(EntityPair("B", "B"), "3",1.0) - - m.set(EntityPair("C", "C"), "2",1.0) - - m.set(EntityPair("D", "D"), "0",1.0) - m.set(EntityPair("D", "D"), "1",1.0) - m.set(EntityPair("D", "D"), "3",1.0) - // Just use rows and cols 1,2,3 for testing purposes - val testRows = Set(EntityPair("B", "B"), EntityPair("C", "C"), EntityPair("D", "D")) - val testCols = Set("1", "2", "3") - - // Make sure that test passes for different random initializations - for (seed <- 0 until 10) { - val random = new Random(seed) - val (mtrain, mdev, mtest) = m.randomTestSplit(2,3,Some(testRows), Some(testCols), random) - // Cell 2,2 is not eligible, so there are only 2 cells left for test set - assertEquals(1.0, mtrain.get(EntityPair("C", "C"), "2"), eps) - assertEquals(0, mtest.get(EntityPair("C", "C"), "2"), eps) - assertEquals(0, mdev.get(EntityPair("C", "C"), "2"), eps) - assertEquals(2,mdev.nnz()) - assertEquals(2,mtest.nnz()) - assertEquals(5,mtrain.nnz()) - // the 3 matrices are a partitioning of m: - // 1. their size is 2+2+5 = 9 - // 2.
they contain all elements - //assertEquals(m.getNnzCells().toSet, mtrain.getNnzCells().toSet ++ mtest.getNnzCells().toSet ++ mdev.getNnzCells().toSet) - } - } - -} diff --git a/src/test/scala/cc/factorie/app/uschema/TestMatrixIndexMap.scala b/src/test/scala/cc/factorie/app/uschema/TestMatrixIndexMap.scala deleted file mode 100644 index a6aefbd..0000000 --- a/src/test/scala/cc/factorie/app/uschema/TestMatrixIndexMap.scala +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.uschema - -import org.scalatest.junit.JUnitSuite -import cc.factorie.util -import org.junit.Test -import org.junit.Assert._ -import com.mongodb.DB -import com.github.fakemongo.Fongo -import scala.Some - -/** - * Created by beroth on 3/9/15. - */ -class TestMatrixIndexMap extends JUnitSuite with util.FastLogging { - @Test def readWriteMongoStringMapTest() { - val smap = new StringMemoryIndexMap(collectionPrefix = MongoWritable.ENTITY_ROW_MAP_PREFIX) - smap.add("b") - smap.add("C") - smap.add("d") - smap.add("a") - smap.add("b") - smap.add("A") - val fongo = new Fongo("myserver") - val db : DB = fongo.getDB("mydb") - - smap.writeToMongo(db) - - val smap2 = new StringMemoryIndexMap(collectionPrefix = MongoWritable.ENTITY_ROW_MAP_PREFIX) - smap2.populateFromMongo(db) - - assertEquals(smap.size, smap2.size) - - for(i <- 0 until smap.size) { - assertEquals(smap.indexToKey(i), smap2.indexToKey(i)) - } - } - - - - @Test def readWriteMongoEntityPairMapTest() { - val emap = new EntityPairMemoryMap(collectionPrefix = MongoWritable.ENTITY_ROW_MAP_PREFIX) - emap.add(new EntityPair("a","b")) - emap.add(new EntityPair("A","B")) - emap.add(new EntityPair("c","b")) - emap.add(new EntityPair("C","D")) - emap.add(new EntityPair("a","b")) - - val fongo = new Fongo("myserver") - val db : DB = fongo.getDB("mydb2") - - emap.writeToMongo(db) - val emap2 = new EntityPairMemoryMap(collectionPrefix = MongoWritable.ENTITY_ROW_MAP_PREFIX) - emap2.populateFromMongo(db) - - assertEquals(emap.size, emap2.size) - - for(i <- 0 until emap.size) { - assertEquals(emap.indexToKey(i), emap2.indexToKey(i)) - } - } -} diff --git a/src/test/scala/cc/factorie/app/uschema/TestUniversalSchemaTrainer.scala b/src/test/scala/cc/factorie/app/uschema/TestUniversalSchemaTrainer.scala deleted file mode 100644 index a57ee76..0000000 --- a/src/test/scala/cc/factorie/app/uschema/TestUniversalSchemaTrainer.scala +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.app.uschema - -import org.scalatest.junit.JUnitSuite -import cc.factorie.util -import org.junit.Test -import scala.util.Random -import org.junit.Assert._ -import scala.Some - -/** - * Created by beroth on 2/20/15. - */ -class TestUniversalSchemaTrainer extends JUnitSuite with util.FastLogging { - - @Test def testSplitRandomizedTest() { - val numRows = 1000 - val numCols = 10000 - val nnz = 100000 - - val numTopics = 100 - val noise1 = 0.1 - - // Test whether objective function goes up - for (seed <- 0 until 2) { - val random = new Random(seed) - val m = CoocMatrix.randomOneZeroMatrix(numRows, numCols, nnz, random, numTopics, noise1).prune(1,1)._1 - println("nnz: " + m.nnz()) - - val stepsize = 0.1 - val regularizer = 0.01 - val dim = 10 - val iters = 10 - - val model = UniversalSchemaModel.randomModel(numRows, numCols, dim, random) - val trainer = new RegularizedBprUniversalSchemaTrainer(regularizer, stepsize, dim, m, model, random) - val objectiveValues = trainer.train(iters) - assertTrue(objectiveValues(0) < objectiveValues(9)) - assertTrue(objectiveValues(0) < objectiveValues(4)) - assertTrue(objectiveValues(4) < objectiveValues(9)) - } - - val numDevNNZ = 0 - val numTestNNZ = 150 - - for (seed <- 0 until 2) { - val random = new Random(seed) - val m = CoocMatrix.randomOneZeroMatrix(numRows, numCols, nnz, random, numTopics, noise1).prune(1,1)._1 - println("nnz: " + m.nnz()) - val (mTrain,mDev,mTest) = m.randomTestSplit(numDevNNZ, numTestNNZ, None, Some(Set(0,1,2,3,4,5,6,7,8,9)), random) - - val stepsize = 0.1 - val regularizer = 0.01 - val dim = 10 - - // Train model for different number of iterations - val model0 = UniversalSchemaModel.randomModel(numRows, numCols, dim, random) - val model5 = UniversalSchemaModel.randomModel(numRows, numCols, dim, random) - val trainer5 = new RegularizedBprUniversalSchemaTrainer(regularizer, stepsize, dim, mTrain, model5, random) - trainer5.train(5) - println("--") - val model10 = UniversalSchemaModel.randomModel(numRows, numCols, dim, random) - val trainer10 = new RegularizedBprUniversalSchemaTrainer(regularizer, stepsize, dim, mTrain, model10, random) - trainer10.train(10) - - val result0 = model0.similaritiesAndLabels(mTrain, mTest) - val result5 = model5.similaritiesAndLabels(mTrain, mTest) - val result10 = model10.similaritiesAndLabels(mTrain, mTest) - - println("0 iters map: " + Evaluator.meanAveragePrecision(result0)) - println("5 iters map: " + Evaluator.meanAveragePrecision(result5)) - println("10 iters map: " + Evaluator.meanAveragePrecision(result10)) - - assertTrue(Evaluator.meanAveragePrecision(result5) > Evaluator.meanAveragePrecision(result0)) - assertTrue(Evaluator.meanAveragePrecision(result10) > Evaluator.meanAveragePrecision(result5)) - } - } - - @Test def normConstrainedTrainingTest() { - println("Norm constrained BPR:") - val numRows = 1000 - val numCols = 10000 - val nnz = 100000 - - val numTopics = 100 - val noise1 = 0.1 - - val maxNorm = 1 - - // Test whether objective function goes up - for (seed <- 0 until 2) { - val random = new Random(seed) - val m = CoocMatrix.randomOneZeroMatrix(numRows, numCols, nnz, 
random, numTopics, noise1).prune(1,1)._1 - println("nnz: " + m.nnz()) - - val stepsize = 0.1 - val dim = 10 - val iters = 10 - - val model = UniversalSchemaModel.randomModel(numRows, numCols, dim, random) - val trainer = new NormConstrainedBprUniversalSchemaTrainer(maxNorm, stepsize, dim, m, model, random) - val objectiveValues = trainer.train(iters) - assertTrue(objectiveValues(0) < objectiveValues(9)) - assertTrue(objectiveValues(0) < objectiveValues(4)) - assertTrue(objectiveValues(4) < objectiveValues(9)) - } - - val numDevNNZ = 0 - val numTestNNZ = 150 - - for (seed <- 0 until 2) { - val random = new Random(seed) - val m = CoocMatrix.randomOneZeroMatrix(numRows, numCols, nnz, random, numTopics, noise1).prune(1,1)._1 - println("nnz: " + m.nnz()) - val (mTrain,mDev,mTest) = m.randomTestSplit(numDevNNZ, numTestNNZ, None, Some(Set(0,1,2,3,4,5,6,7,8,9)), random) - - val stepsize = 0.1 - val dim = 10 - - // Train model for different number of iterations - val model0 = UniversalSchemaModel.randomModel(numRows, numCols, dim, random) - val model5 = UniversalSchemaModel.randomModel(numRows, numCols, dim, random) - val trainer5 = new NormConstrainedBprUniversalSchemaTrainer(maxNorm, stepsize, dim, mTrain, model5, random) - trainer5.train(5) - println("--") - val model10 = UniversalSchemaModel.randomModel(numRows, numCols, dim, random) - val trainer10 = new NormConstrainedBprUniversalSchemaTrainer(maxNorm, stepsize, dim, mTrain, model10, random) - trainer10.train(10) - - val result0 = model0.similaritiesAndLabels(mTrain, mTest) - val result5 = model5.similaritiesAndLabels(mTrain, mTest) - val result10 = model10.similaritiesAndLabels(mTrain, mTest) - - println("0 iters map: " + Evaluator.meanAveragePrecision(result0)) - println("5 iters map: " + Evaluator.meanAveragePrecision(result5)) - println("10 iters map: " + Evaluator.meanAveragePrecision(result10)) - - assertTrue(Evaluator.meanAveragePrecision(result5) > Evaluator.meanAveragePrecision(result0)) - assertTrue(Evaluator.meanAveragePrecision(result10) > Evaluator.meanAveragePrecision(result5)) - } - } -} diff --git a/src/test/scala/cc/factorie/directed/TestBeta.scala b/src/test/scala/cc/factorie/directed/TestBeta.scala deleted file mode 100644 index d97f3f2..0000000 --- a/src/test/scala/cc/factorie/directed/TestBeta.scala +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.directed - -import cc.factorie.util.FastLogging -import cc.factorie.variable.DoubleVariable -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -class TestBeta extends JUnitSuite with FastLogging { - - @Test - def testBta(): Unit = { - val alpha = new DoubleVariable(1.0) - val beta = new DoubleVariable(3.0) - val value = new DoubleVariable(0.5) - - // mean = alpha / (alpha + beta) - assertEquals(0.25, Beta.mean(alpha.doubleValue, beta.doubleValue), 0.01) - - val f = Beta.newFactor(value, alpha, beta) - assert(f.pr(value.doubleValue, alpha.doubleValue, beta.doubleValue) == Beta.pr(value.doubleValue, alpha.doubleValue, beta.doubleValue)) - } - -} diff --git a/src/test/scala/cc/factorie/directed/TestDirectedModel.scala b/src/test/scala/cc/factorie/directed/TestDirectedModel.scala deleted file mode 100644 index d7450b1..0000000 --- a/src/test/scala/cc/factorie/directed/TestDirectedModel.scala +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.directed - -import cc.factorie.util.FastLogging -import cc.factorie.variable.DoubleVariable -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -class TestDirectedModel extends JUnitSuite with FastLogging { - - @Test - def testDirectedModel(): Unit = { - implicit val model = DirectedModel() - implicit val random = new scala.util.Random(0) - - assert(model.isInstanceOf[ItemizedDirectedModel]) - - val mean = new DoubleVariable(1) - val variance = new DoubleVariable(2.0) - - val data = for (i <- 1 to 10) yield new DoubleVariable :~ Gaussian(mean, variance) - - assert(model.factors(mean).size == 10) - assert(model.childFactors(mean).size == 10) - - } - -} diff --git a/src/test/scala/cc/factorie/directed/TestDirichlet.scala b/src/test/scala/cc/factorie/directed/TestDirichlet.scala deleted file mode 100644 index b742a08..0000000 --- a/src/test/scala/cc/factorie/directed/TestDirichlet.scala +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.directed - -import cc.factorie.infer.Maximize -import cc.factorie.util.FastLogging -import cc.factorie.variable._ -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -class TestDirichlet extends JUnitSuite with FastLogging { - - @Test - def testDirichlet(): Unit = { - implicit val random = new scala.util.Random(0) - object WordDomain extends EnumDomain { val a, b, c, d, e, f = Value } - class Word extends DiscreteVariable { def domain = WordDomain } - implicit val model = DirectedModel() - - val masses = new MassesVariable(new DenseMasses1(WordDomain.size, 2.0)) - assertArrayEquals(Array(2.0,2.0,2.0,2.0,2.0,2.0), masses.value.toArray, 0.01) - - // generate - val p1 = new ProportionsVariable(new DenseProportions1(WordDomain.size)) - p1 :~ Dirichlet(masses) - - val data = for (i <- 0 until 500) yield new Word :~ Discrete(p1) - - assert(model.parentFactor(p1).touches(masses)) - assert(model.childFactors(p1).size == 500) - - val s1 = MaximizeProportions.infer(Seq(p1), model) - val s2 = Maximize(Seq(p1), model) - -// val ps = for (i <- 0 until 1000) yield ProportionsVariable.dense(WordDomain.size) :~ Dirichlet(masses) -// MaximizeDirichletByMomentMatching(masses, model) - } - -} diff --git a/src/test/scala/cc/factorie/directed/TestDiscrete.scala b/src/test/scala/cc/factorie/directed/TestDiscrete.scala deleted file mode 100644 index 54d844c..0000000 --- a/src/test/scala/cc/factorie/directed/TestDiscrete.scala +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.directed - -import org.junit.Test -import org.scalatest.junit._ - - -class TestDiscrete extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testMaximizeGeneratedDiscrete(): Unit = { - } - -} diff --git a/src/test/scala/cc/factorie/directed/TestFunction.scala b/src/test/scala/cc/factorie/directed/TestFunction.scala deleted file mode 100644 index 9fd4509..0000000 --- a/src/test/scala/cc/factorie/directed/TestFunction.scala +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.directed - -import cc.factorie.variable.DoubleVariable -import org.junit.Test -import org.scalatest.junit._ - - -class TestFunction extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testDoubleSum(): Unit = { - implicit val model = DirectedModel() - implicit val random = new scala.util.Random(0) - - val a = new DoubleVariable(1.0) - val b = new DoubleVariable(2.0) -// val c = new DoubleSum(a, b) -// assertEquals(3.0, c.doubleValue, 0.001) - } - -} diff --git a/src/test/scala/cc/factorie/directed/TestGaussian.scala b/src/test/scala/cc/factorie/directed/TestGaussian.scala deleted file mode 100644 index 5fc420f..0000000 --- a/src/test/scala/cc/factorie/directed/TestGaussian.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.directed - -import cc.factorie.util.FastLogging -import cc.factorie.variable.DoubleVariable -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -class TestGaussian extends JUnitSuite with FastLogging { - - @Test - def testGaussian(): Unit = { - val mean = new DoubleVariable(0.0) - val variance = new DoubleVariable(1.0) - val value = new DoubleVariable(2.0) - - val f = Gaussian.newFactor(value, mean, variance) - assert(f.pr == Gaussian.pr(value.doubleValue, mean.doubleValue, variance.doubleValue)) - assert(f.logpr == Gaussian.logpr(value.doubleValue, mean.doubleValue, variance.doubleValue)) - assertEquals(f.logpr, math.log(f.pr), 0.001) - } - -} diff --git a/src/test/scala/cc/factorie/directed/TestMaximizeProportions.scala b/src/test/scala/cc/factorie/directed/TestMaximizeProportions.scala deleted file mode 100644 index d218bba..0000000 --- a/src/test/scala/cc/factorie/directed/TestMaximizeProportions.scala +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.directed - -import cc.factorie.variable._ -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit._ - - -class TestMaximizeProportions extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testInfer(): Unit = { - implicit val model = DirectedModel() - - object DiceDomain extends EnumDomain { val ONE, TWO, THREE, FOUR, FIVE, SIX = Value } - class DiceSample(value:Int) extends DiscreteVariable(value) { def domain = DiceDomain } - - // this is our dice - val p = new ProportionsVariable(new DenseProportions1(DiceDomain.size)) - - // suppose we have an even dice - // now generate some samples for this proportion - new DiceSample(DiceDomain.ONE) ~ Discrete(p) - new DiceSample(DiceDomain.TWO) ~ Discrete(p) - new DiceSample(DiceDomain.THREE) ~ Discrete(p) - new DiceSample(DiceDomain.FOUR) ~ Discrete(p) - new DiceSample(DiceDomain.FIVE) ~ Discrete(p) - new DiceSample(DiceDomain.SIX) ~ Discrete(p) - - // we can estimate the proportion from the sample data - val s = MaximizeProportions.infer(Seq(p), model) - - val prop: Double = 1.0/6 - assertArrayEquals(Array(prop, prop, prop, prop, prop, prop), s.marginal(p)._1.value.asInstanceOf[DenseProportions1].toArray, 0.1) - } -} diff --git a/src/test/scala/cc/factorie/directed/TestPlatedDiscrete.scala b/src/test/scala/cc/factorie/directed/TestPlatedDiscrete.scala deleted file mode 100644 index 0c211c4..0000000 --- a/src/test/scala/cc/factorie/directed/TestPlatedDiscrete.scala +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.directed - -import cc.factorie.util.FastLogging -import cc.factorie.variable._ -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -class TestPlatedDiscrete extends JUnitSuite with FastLogging { - - // Suppose we have a bunch of coins, we flip each coin and check the results - object CoinDomain extends DiscreteDomain(2) - object CoinSeqDomain extends DiscreteSeqDomain { def elementDomain = CoinDomain } - class CoinSeq(num:Int) extends DiscreteSeqVariable(num) { def domain = CoinSeqDomain } - - @Test - def testPlatedDiscrete(): Unit = { - // 0 is tail, 1 is head - // all coins have p(tail) = 0.6, p(head) = 0.4 - val p = new ProportionsVariable(new DenseProportions1(Array(0.6, 0.4))) - - // construct the directed model and flip coins from the given distribution - implicit val model = DirectedModel() - implicit val random = new scala.util.Random(0) - val cs = new CoinSeq(1000) :~ PlatedDiscrete(p) - - // check the generated sequence - val numTails = cs.intValues.filter(_ == 0).length - assertEquals(0.6, numTails.toDouble/1000, 0.01) - } - -} diff --git a/src/test/scala/cc/factorie/directed/TestPoisson.scala b/src/test/scala/cc/factorie/directed/TestPoisson.scala deleted file mode 100644 index 7c907c2..0000000 --- a/src/test/scala/cc/factorie/directed/TestPoisson.scala +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.directed - -import cc.factorie.maths -import cc.factorie.util.FastLogging -import cc.factorie.variable.{DoubleVariable, IntegerVariable} -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -class TestPoisson extends JUnitSuite with FastLogging { - - @Test - def testPoisson(): Unit = { - val mean = new DoubleVariable(1.0) - val k = new IntegerVariable(2) - - val f = Poisson.newFactor(k, mean) - assertEquals(math.pow(mean.doubleValue, k.intValue) * math.exp(-mean.doubleValue) / maths.factorial(k.intValue), f.pr(k.intValue, mean.doubleValue), 0.001) - } - -} diff --git a/src/test/scala/cc/factorie/infer/TestBP.scala b/src/test/scala/cc/factorie/infer/TestBP.scala deleted file mode 100644 index f687c28..0000000 --- a/src/test/scala/cc/factorie/infer/TestBP.scala +++ /dev/null @@ -1,631 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License.
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.infer - -import cc.factorie._ -import cc.factorie.app.chain.ChainModel -import cc.factorie.app.nlp.Token -import cc.factorie.model.{Factor, Parameters, _} -import cc.factorie.optimize.LikelihoodExample -import cc.factorie.variable.{DiscreteDomain, DiscreteVariable, _} -import org.junit.Assert._ -import org.junit.Test - -import scala.collection.mutable.ArrayBuffer -import scala.util.Random - -/** - * Test for the factorie-1.0 BP framework (that uses WeightsMap) - * @author sameer, brian - * @since Aug 7, 2012 - */ -class TestBP extends util.FastLogging { - - import cc.factorie.infer.BPTestUtils._ - - val eps = 1e-4 - - @Test def v1f1Test() { - // one variable, one factor - val v = new BinVar(0) - val model = new ItemizedModel(newFactor1(v, 1, 1)) - val fg = BPSummary(Set(v), model) - assert(fg.bpFactors.size == 1) - assert(fg.bpVariables.size == 1) - BP.inferLoopy(fg, 1) - logger.debug(fg.marginal(v).proportions) - assertEquals(0.5, fg.marginal(v).proportions(0), eps) - } - - @Test def v1f1UnequalPotentialsSum() { - // one variable, one factor - val v = new BinVar(0) - val model = new ItemizedModel(newFactor1(v, 2, 1)) - val fg = BPSummary(Set(v), model) - assert(fg.bpFactors.size == 1) - assert(fg.bpVariables.size == 1) - BP.inferLoopy(fg, 1) - //logger.debug(fg.marginal(v).proportions) - assertEquals(e(2) / (e(2) + e(1)), fg.marginal(v).proportions(0), eps) - } - - @Test def v1f2Test1() { - //f1 = {0: 2, 1: 1}, f2 = {0: 1, 1: 2}") { - // one variable, two factors - val v = new BinVar(0) - val model = new ItemizedModel(newFactor1(v, 1, 2), newFactor1(v, 2, 1)) - val fg = BPSummary(Set(v), model) - assert(fg.bpFactors.size == 2) - assert(fg.bpVariables.size == 1) - BP.inferLoopy(fg, 1) - //logger.debug(fg.marginal(v).proportions) - assertEquals(0.5, fg.marginal(v).proportions(0), eps) - } - - @Test def v1f2Test2() { - // f1 = {0: 0, 1: 1}, f2 = {0: 0, 1: 1}") { - // one variable, two factors - val v = new BinVar(0) - val model = new ItemizedModel(newFactor1(v, 0, 1), newFactor1(v, 0, 1)) - val fg = BPSummary(Set(v), model) - assert(fg.bpFactors.size == 2) - assert(fg.bpVariables.size == 1) - BP.inferLoopy(fg, 1) - //logger.debug(fg.marginal(v).proportions) - assertEquals(fg.marginal(v).proportions(0), e(0) / (e(0) + e(2)), eps) - } - - @Test def v1f2MAP1() { - // f1 = {0: 2, 1: 1}, f2 = {0: 1, 1: 2}") { - // one variable, two factors - val v = new BinVar(0) - val model = new ItemizedModel(newFactor1(v, 1, 2), newFactor1(v, 2, 1)) - val fg = BPSummary(Set(v), BPMaxProductRing, model) - BP.inferLoopy(fg, 2) - //logger.debug(fg.marginal(v).proportions) - assertEquals(fg.marginal(v).proportions.maxIndex, 0) - } - - @Test def v1f2MAP2() { - // f1 = {0: 0, 1: 1}, f2 = {0: 0, 1: 1}") { - // one variable, two factors - val v = new BinVar(0) - val model = new ItemizedModel(newFactor1(v, 0, 1), newFactor1(v, 0, 1)) - val fg = BPSummary(Set(v), BPMaxProductRing, model) - BP.inferLoopy(fg, 1) - //logger.debug(fg.marginal(v).proportions) - assertEquals(fg.marginal(v).proportions.maxIndex, 1) - } - - @Test def v1f2ChainLogZ() { - // f1 = {0: 0, 1: 1}, f2 = {0: 0, 1: 1}") { - // one 
variable, two factors - val v = new BinVar(0) - val model = new ItemizedModel(newFactor1(v, 0.5, 1.3), newFactor1(v, -0.3, 4.0)) - val s = BP.inferChainMax(Seq(v), model) - val s2 = BP.inferChainSum(Seq(v), model) - // make sure all factors have the same logz - //val szs = s.bpFactors.to[Vector].map(_.calculateLogZ) - val s2zs = s2.bpFactors.to[Vector].map(_.calculateLogZ) - // assert(szs.distinct.length == 1) - assert(s2zs.distinct.length == 1) - } - - @Test def v2f1VaryingBoth() { - logger.debug("V2F1: varying both") - // a sequence of two variables, one factor - val v1 = new BinVar(1) - val v2 = new BinVar(0) - - // create template between v1 and v2 - val model = newTemplate2(v1, v2, 10, 0) - val vars: Set[DiscreteVar] = Set(v1, v2) - - val f = model.factors(v1).head - logger.debug("f score unequal: " + f.currentScore) - v2 := 1 - logger.debug("f score equal: " + f.currentScore) - - - logger.debug(newTemplate2(v1, v2, 10.0, 0.0).neighborDomain2) - logger.debug(model.asInstanceOf[FamilyWithNeighborDomains].neighborDomains) - - // vary both variables - val fg = BPSummary(vars, model) - assert(fg.bpFactors.size == 1) - assert(fg.bpVariables.size == 2) - BP.inferLoopy(fg, 5) - logger.debug("v1 : " + fg.marginal(v1).proportions) - logger.debug("v2 : " + fg.marginal(v2).proportions) - assertEquals(0.5, fg.marginal(v1).proportions(0), eps) - assertEquals(0.5, fg.marginal(v2).proportions(0), eps) - - assertEquals(math.log(2*math.exp(10) + 2*math.exp(0)), fg.logZ, 0.001) - - val fg2 = BP.inferChainSum(Seq(v1, v2), model) - assertEquals(math.log(2*math.exp(10) + 2*math.exp(0)), fg2.logZ, 0.001) - - val fg3 = BP.inferTreeSum(Seq(v1, v2).toSet, model) - assertEquals(math.log(2*math.exp(10) + 2*math.exp(0)), fg3.logZ, 0.001) - } - - @Test def v2f2VaryingBoth() { - logger.debug("V2F1: varying both") - // a sequence of two variables, one factor - val v1 = new BinVar(1) - val v2 = new BinVar(0) - - // create template between v1 and v2 - val model = new ItemizedModel(newFactor2(v1, v2, 10, 0), newFactor1(v1, 2, 1)) - val vars: Set[DiscreteVar] = Set(v1, v2) - - val logZ = math.log( - math.exp(10 + 2) // 0 0 - + math.exp(0 + 2)// 0 1 - + math.exp(0 + 1)// 1 0 - + math.exp(10 + 1)// 1 1 - ) - - val fg2 = BP.inferChainSum(Seq(v1, v2), model) - assertEquals(logZ, fg2.logZ, 0.001) - - val fg3 = BP.inferTreeSum(Seq(v1, v2).toSet, model) - assertEquals(logZ, fg3.logZ, 0.001) - - } - - @Test def testLoopyLogZ() { - val random = new scala.util.Random(0) - object cdomain extends CategoricalVectorDomain[String]() - val features = new BinaryFeatureVectorVariable[String]() { def domain = cdomain } - features += "asd" - val ldomain = new CategoricalDomain[String]() - val d = new app.nlp.Document("noname") - val t0 = new Token(d, 0, 1) - val t1 = new Token(d, 0, 1) - val t2 = new Token(d, 0, 1) - val t3 = new Token(d, 0, 1) - class Label(t: String) extends LabeledCategoricalVariable[String](t) { def domain = ldomain} - val l0 = new Label("1") - val l1 = new Label("0") - val l2 = new Label("2") - val l3 = new Label("3") - val lToT = Map(l0 -> t0, l1 -> t1, l2 -> t2, l3 -> t3) - val tToL = Map(t0 -> l0, t1 -> l1, t2 -> l2, t3 -> l3) - val model = new ChainModel[Label, BinaryFeatureVectorVariable[String], Token](ldomain, cdomain, l => features, lToT, tToL) - model.parameters.tensors.foreach(t => t.foreachElement((i, v) => t(i) += random.nextDouble())) - val trueLogZ = InferByBPChain.infer(Seq(l0, l1, l2, l3), model).logZ - val loopyLogZ = InferByBPLoopyTreewise.infer(Seq(l0, l1, l2, l3), model).logZ - 
assertEquals(trueLogZ, loopyLogZ, 0.01) - - val ex = new model.ChainLikelihoodExample(Seq(l0, l1, l2, l3)) - assert(optimize.Example.testGradient(model.parameters, model.parameters.keys, ex)) - val ex2 = new LikelihoodExample(Seq(l0, l1, l2, l3), model, InferByBPChain) - assert(optimize.Example.testGradient(model.parameters, model.parameters.keys, ex2)) - val ex3 = new LikelihoodExample(Seq(l0, l1, l2, l3), model, InferByBPTree) - assert(optimize.Example.testGradient(model.parameters, model.parameters.keys, ex3)) - - val fastSum = model.inferFast(Seq(l0, l1, l2, l3)) - val sum = InferByBPChain.infer(Seq(l0, l1, l2, l3), model) - assertEquals(sum.logZ, fastSum.logZ, 0.001) - for (label <- Seq(l0, l1, l2, l3)) { - // assertArrayEquals(sum.marginal(label).proportions.toArray, fastSum.marginal(label).asInstanceOf[DiscreteMarginal1[DiscreteVar]].proportions.toArray, 0.001) - } - for (factor <- sum.factors.get) { - // assertArrayEquals(sum.marginal(factor).tensorStatistics.toArray, fastSum.marginal(factor).tensorStatistics.toArray, 0.001) - } - - val meanFieldSummary = InferByMeanField.apply(Seq(l0, l1, l2, l3), model) - val BPSummary = InferByBPChain(Seq(l0, l1, l2, l3), model) - for (v <- meanFieldSummary.variables) { - val mfm = meanFieldSummary.marginal(v) - val bpm = BPSummary.marginal(v) - for (i <- 0 until mfm.proportions.length) { - assertEquals(mfm.proportions(i), bpm.proportions(i), 0.1) - } - } - - // Testing MPLP - val mplpSummary = MaximizeByMPLP.infer(Seq(l0, l1, l2, l3), model) - val mapSummary = MaximizeByBPChain.infer(Seq(l0, l1, l2, l3), model) - for (v <- Seq(l0, l1, l2, l3)) { - val mfm = mplpSummary.mapAssignment(v) - val bpm = mapSummary.mapAssignment(v) - assertEquals(bpm.intValue, mfm.intValue) - } - - // testing dual decomposition -// val model0 = DualDecomposition.getBPInferChain(Seq(l0, l1, l2), model) -// val model1 = DualDecomposition.getBPInferChain(Seq(l2, l3), model) -// val ddSummary = InferByDualDecomposition.infer(Seq(model0, model1), Seq((0, l2, 1, l2))) -// for (v <- Seq(l0, l1, l2, l3)) { -// val mfm = ddSummary.mapAssignment(v) -// val bpm = mapSummary.mapAssignment(v) -// assertEquals(bpm.intValue, mfm.intValue) -// } - - val samplingSummary = InferByGibbsSampling.infer(Seq(l0, l1, l2, l3), model) - for ((variable, marginal) <- samplingSummary.variableMap) { - variable.value - marginal.proportions - } - } - - @Test def v2f1VaryingOne() { - logger.debug("V2F1: varying one") - // a sequence of two variables, one factor - val v1 = new BinVar(1) - val v2 = new BinVar(0) - - // create template between v1 and v2 - val model = newTemplate2(v1, v2, -10, 0) - val vars: Set[Var] = Set(v1, v2) - val varying = Set(v1) - - val fg = BPSummary(varying, model) - assert(fg.bpFactors.size == 1) - assert(fg.bpVariables.size == 1) - BP.inferLoopy(fg, 5) - logger.debug("v1 : " + fg.marginal(v1).proportions) - - val v1Marginal = fg.marginal(v1).proportions - for ((_, i) <- v1.settings.zipWithIndex if v1.value == v2.value) - assertEquals(v1Marginal(i), 0.0, eps) - - } - - @Test def loop2() { - val v1 = new BinVar(1) - val v2 = new BinVar(0) - val vars: Set[DiscreteVar] = Set(v1, v2) - - val model = new ItemizedModel( - // bias - newFactor1(v1, 1, 0), - newFactor1(v2, 1, 0), - // loop - newFactor2(v1, v2, 1, 0), - newFactor2(v1, v2, 3, -1) - ) - - var fg = BPSummary(vars, model) - BP.inferLoopy(fg, 1) - logger.debug("v1 : " + fg.marginal(v1).proportions) - logger.debug("v2 : " + fg.marginal(v2).proportions) - - fg.setToMaximize(null) - - logger.debug("v1 val : " + v1.value) - 
logger.debug("v2 val : " + v2.value) - assert(v1.intValue == 0) - assert(v2.intValue == 0) - } - - @Test def loop4() { - logger.debug("Loop4") - val v1 = new BinVar(1) - val v2 = new BinVar(0) - val v3 = new BinVar(1) - val v4 = new BinVar(0) - val vars: Set[DiscreteVar] = Set(v1, v2, v3, v4) - - val model = new ItemizedModel( - // loop of repulsion factors, with v4 having an extra factor - // pegging its value to 0 - newFactor2(v1, v2, -5, 0), - newFactor2(v2, v3, -5, 0), - newFactor2(v3, v4, -5, 0), - newFactor2(v4, v1, -5, 0), - // bias - newFactor1(v4, 10, 0) - ) - - val fg = BPSummary(vars, model) - BP.inferLoopy(fg, 4) - fg.setToMaximize() - - assertEquals(fg.marginal(v1).proportions(0), 0.0, 0.1) - assertEquals(fg.marginal(v1).proportions(1), 1.0, 0.1) - assertEquals(fg.marginal(v2).proportions(0), 1.0, 0.1) - assertEquals(fg.marginal(v2).proportions(1), 0.0, 0.1) - assertEquals(fg.marginal(v3).proportions(0), 0.0, 0.1) - assertEquals(fg.marginal(v3).proportions(1), 1.0, 0.1) - assertEquals(fg.marginal(v4).proportions(0), 1.0, 0.1) - assertEquals(fg.marginal(v4).proportions(1), 0.0, 0.1) - - assertEquals(v1.intValue, 1) - assertEquals(v2.intValue, 0) - assertEquals(v3.intValue, 1) - assertEquals(v4.intValue, 0) - } - - @Test def chainRandom() { - logger.debug("ChainRandom") - val numVars = 2 - val vars: Seq[BinVar] = (0 until numVars).map(new BinVar(_)).toSeq - val varSet = vars.toSet[DiscreteVar] - for (seed <- 0 until 50) { - val random = new Random(seed * 1024) - val model = new ItemizedModel - for (i <- 0 until numVars) { - model += newFactor1(vars(i), 0, random.nextDouble() * 4.0 - 2.0) - if ((i + 1) != numVars) model += newFactor2(vars(i), vars(i + 1), 0, random.nextDouble() * 6.0 - 3.0) - } - // true marginals and the map - val marginals: Array[Double] = Array.fill(numVars)(0.0) - - // go through all the configurations - var Z = 0.0 - val scores = new ArrayBuffer[Double] - var maxScore = Double.NegativeInfinity - var mapAssignment: Int = -1 - for (bs <- 0 until math.pow(2, numVars).toInt) { - for (i <- 0 until numVars) { - vars(i).set((bs / math.pow(2, i)).toInt % 2)(null) - } - val score = model.currentScore(vars.toIterable) - scores += score - Z += math.exp(score) - for (i <- 0 until numVars) { - if (vars(i).intValue == 0) { - marginals(i) += math.exp(score) - } - } - if (score > maxScore) { - maxScore = score - mapAssignment = bs - } - } - logger.debug("map : " + mapAssignment) - logger.debug("marginals : " + marginals.map(_ / Z).mkString(", ")) - - // test sum-product - val fg = BP.inferChainSum(vars, model) - for (i <- 0 until numVars) { - logger.debug("v" + i + " : " + fg.marginal(vars(i)).proportions) - assertEquals(marginals(i) / Z, fg.marginal(vars(i)).proportions(0), eps) - } - - assertEquals(fg.bpFactors.head.calculateLogZ, fg.bpFactors.last.calculateLogZ, 0.1) - - // TODO: add back logZ assertion - //logger.debug("z : " + math.log(Z) + ", " + fg.logZ()) - //assertEquals(math.log(Z), fg.logZ(), eps) - // max product - - val mfg = BP.inferChainMax(vars, model) - val mfg2 = BP.inferTreeMarginalMax(vars, model) - assertEquals(mfg.logZ, mfg2.logZ, 0.001) - for (v <- vars) { - assertEquals(mfg.mapAssignment(v).intValue, mfg2.mapAssignment(v).intValue) - } - mfg.setToMaximize(null) - logger.debug("probabilities : " + scores.map(math.exp(_) / Z).mkString(", ")) - for (i <- 0 until numVars) { - // logger.debug("v" + i + " : " + mfg.marginal(vars(i)).proportions) - logger.debug("tv" + i + " : " + (mapAssignment / math.pow(2, i)).toInt % 2) - 
assertEquals(vars(i).value.intValue, (mapAssignment / math.pow(2, i)).toInt % 2) - } - } - } - - // This is a test for the naive sparsity for BP in Factor3 messing up results on a graph like this: - // Ascii diagram: - // - // featureVariable <-- (constant) - // | - // firstVar ---+--- secondVar - - @Test def defaultSparsity() { - - // Create the variables we'll do inference over - - val firstVar = new BooleanVariable() - val secondVar = new BooleanVariable() - - // Create a feature variable (constant during inference), with string features - - val featureDomain: CategoricalDomain[String] = new CategoricalDomain[String] - val featuresVar = new FeatureVectorVariable[String]() { - override def domain: CategoricalVectorDomain[String] = new CategoricalVectorDomain[String] { - override def dimensionDomain = featureDomain - } - } - - // put a single feature in the variable - // "feat1" -> 1.0 - - featuresVar.update(featureDomain.index("feat1"), 1.0)(null) - - val model = new Model with Parameters { - val errorModel = new DotFamilyWithStatistics3[BooleanVariable,BooleanVariable,FeatureVectorVariable[String]] { - val weights = Weights(new la.DenseTensor3(2, 2, 1)) - weights.value := Array(3.0, 1.0, 0.5, 0.5) - } - override def factors(variables: Iterable[Var]): Iterable[Factor] = { - List(errorModel.Factor(firstVar, secondVar, featuresVar)) - } - } - - // Do the inference over firstVar and secondVar using BP on a Tree - val sumExactBeliefs : Summary = BP.inferTreeSum(List(firstVar, secondVar), model) - // Get the marginals - val m1 : DiscreteMarginal1[BooleanVariable] = sumExactBeliefs.getMarginal(firstVar).get.asInstanceOf[DiscreteMarginal1[BooleanVariable]] - val m2 : DiscreteMarginal1[BooleanVariable] = sumExactBeliefs.getMarginal(secondVar).get.asInstanceOf[DiscreteMarginal1[BooleanVariable]] - -// println(m1.proportions) -// println(m2.proportions) - - assertEquals(0.8737, m1.proportions.toArray(0), 0.01) - assertEquals(0.1263, m1.proportions.toArray(1), 0.01) - assertEquals(0.8327, m2.proportions.toArray(0), 0.01) - assertEquals(0.1673, m2.proportions.toArray(1), 0.01) - } - - @Test def tree3() { - val v1 = new BinVar(0) - val v2 = new BinVar(1) - val v3 = new BinVar(0) - val vars: Set[DiscreteVar] = Set(v1, v2, v3) - // v1 -- v3 -- v2 - val model = new ItemizedModel( - newFactor1(v1, 3, 0), - newFactor1(v2, 0, 3), - newFactor2(v1, v3, 3, 0), - newFactor2(v3, v2, 3, 0) - ) - - val fg = BP.inferTreeSum(vars, model, root = v3) - fg.setToMaximize() - - logger.debug("v1 : " + fg.marginal(v1).proportions) - logger.debug("v2 : " + fg.marginal(v2).proportions) - logger.debug("v3 : " + fg.marginal(v3).proportions) - logger.debug("v1 val : " + v1.value) - logger.debug("v2 val : " + v2.value) - logger.debug("v3 val : " + v3.value) - - assertEquals(0.5, fg.marginal(v3).proportions(0), eps) - assertEquals(v1.intValue, 0) - assertEquals(v2.intValue, 1) - - var z = 0.0 - for (i <- Seq(0, 1); j <- Seq(0, 1); k <- Seq(0, 1)) { - v1.set(i)(null) - v2.set(j)(null) - v3.set(k)(null) - z += math.exp(model.currentScore(Seq(v1, v2, v3))) - } - - assertEquals(fg.logZ, math.log(z), 0.001) - - val fg2 = BP.inferChainSum(vars.toSeq, model) - assertEquals(fg2.logZ, math.log(z), 0.001) - - val vars2 = Seq(v1, v3) - var z2 = 0.0 - for (i <- Seq(0, 1); j <- Seq(0, 1)) { - v1.set(i)(null) - v3.set(j)(null) - z2 += math.exp(model.currentScore(vars2)) - } - - assertEquals(math.log(z2), BP.inferChainSum(vars2, model).logZ, 0.001) - assertEquals(math.log(z2), BP.inferTreeSum(vars2.toSet, model).logZ, 0.001) - } - - @Test def 
tree7() { - val v1 = new BinVar(0) { override def toString = "v1" } - val v2 = new BinVar(1) { override def toString = "v2" } - val v3 = new BinVar(0) { override def toString = "v3" } - val v4 = new BinVar(0) { override def toString = "v4" } - val v5 = new BinVar(0) { override def toString = "v5" } - val v6 = new BinVar(0) { override def toString = "v6" } - val v7 = new BinVar(0) { override def toString = "v7" } - val vars: Set[DiscreteVar] = Set(v1, v2, v3, v4, v5, v6, v7) - // v4 - // v3 v5 - // v1 v2 v6 v7 - val model = new ItemizedModel( - newFactor1(v1, 10, 0), //newFactor1(v7, 0, 3), - newFactor2(v1, v3, 5, 0), newFactor2(v2, v3, -5, 0), - newFactor2(v3, v4, 5, 0), newFactor2(v5, v4, -5, 0), - newFactor2(v6, v5, 5, 0), newFactor2(v7, v5, -5, 0) - ) - val fg = BP.inferTreeSum(vars, model, v4) - fg.setToMaximize() - - assert(fg.marginal(v7).proportions(0) > 0.95) - - logger.debug("v1 : " + fg.marginal(v1).proportions) - logger.debug("v2 : " + fg.marginal(v2).proportions) - logger.debug("v3 : " + fg.marginal(v3).proportions) - logger.debug("v4 : " + fg.marginal(v4).proportions) - logger.debug("v5 : " + fg.marginal(v5).proportions) - logger.debug("v6 : " + fg.marginal(v6).proportions) - logger.debug("v7 : " + fg.marginal(v7).proportions) - logger.debug(" %2d".format(v4.intValue)) - logger.debug(" %2d %2d".format(v3.intValue, v5.intValue)) - logger.debug("%2d %2d %2d %2d".format(v1.intValue, v2.intValue, v6.intValue, v7.intValue)) - - assert(v1.intValue == 0) - assert(v2.intValue == 1) - assert(v3.intValue == 0) - assert(v4.intValue == 0) - assert(v5.intValue == 1) - assert(v6.intValue == 1) - assert(v7.intValue == 0) - } - -} - -object BPTestUtils { - // a binary variable that takes values 0 or 1 - object BinDomain extends DiscreteDomain(2) - - class BinVar(i: Int) extends DiscreteVariable(i) { - def domain = BinDomain - } - - - def newFactor1(n1: BinVar, score0: Double, score1: Double): Factor = { - val family = new DotTemplateWithStatistics1[BinVar] with Parameters { - val weights = Weights(new la.DenseTensor1(BinDomain.size)) - } - assert(family.weights.value ne null) - family.weights.value(0) = score0 - family.weights.value(1) = score1 - n1.set(0)(null) - n1.set(1)(null) - family.factors(n1).head - } - - def newFactor2(n1: BinVar, n2: BinVar, scoreEqual: Double, scoreUnequal: Double): Factor = { - val family = new DotTemplate2[BinVar, BinVar] with Parameters { - override def neighborDomain1 = BinDomain - override def neighborDomain2 = BinDomain - val weights = Weights(new la.DenseTensor1(BinDomain.size)) - def unroll1(v: BinVar) = if (v == n1) Factor(n1, n2) else Nil - def unroll2(v: BinVar) = if (v == n2) Factor(n1, n2) else Nil - override def statistics(value1: BinVar#Value, value2: BinVar#Value) = - BinDomain(if (value1.intValue == value2.intValue) 0 else 1) - } - assert(!family.statisticsAreValues) - family.weights.value(0) = scoreEqual - family.weights.value(1) = scoreUnequal - family.factors(n1).head - } - - def newTemplate2(n1: BinVar, n2: BinVar, scoreEqual: Double, scoreUnequal: Double) = { - new TupleTemplateWithStatistics2[BinVar, BinVar] { - override def neighborDomain1 = BinDomain - override def neighborDomain2 = BinDomain - def unroll1(v: BPTestUtils.this.type#BinVar) = if (v == n1) Factor(n1, n2) else Nil - def unroll2(v: BPTestUtils.this.type#BinVar) = if (v == n2) Factor(n1, n2) else Nil - def score(v1:BinVar#Value, v2:BinVar#Value): Double = if (v1 == v2) scoreEqual else scoreUnequal - } - } - - import scala.language.existentials - def newFactor3(n1: BinVar, n2: 
BinVar, n3: BinVar, scores: Seq[Double]) = - new TupleFactorWithStatistics3[BinVar, BinVar, BinVar](n1, n2, n3) { - factor => - def score(v1:BinVar#Value, v2:BinVar#Value, v3:BinVar#Value): Double = scores(v1.intValue * 4 + v2.intValue * 2 + v3.intValue) - override def equalityPrerequisite = this - override def toString = "F(%s,%s,%s)".format(n1, n2, n3) - } - - // short for exponential - def e(num: Double) = math.exp(num) - -} diff --git a/src/test/scala/cc/factorie/la/TestTensor.scala b/src/test/scala/cc/factorie/la/TestTensor.scala deleted file mode 100644 index 491d662..0000000 --- a/src/test/scala/cc/factorie/la/TestTensor.scala +++ /dev/null @@ -1,217 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.la - -import cc.factorie._ -import cc.factorie.util.FastSorting -import org.junit.Assert._ -import org.junit._ - -import scala.languageFeature.postfixOps -import scala.util.Random - -class TestTensor extends cc.factorie.util.FastLogging { - - @Test def testSorting(): Unit = { - val arr1 = Array(1, 2, 5, 3, 9, 7, 8, 4) - val arr2 = Array(1, 2, 3, 4, 5, 6, 7, 8) - FastSorting.quickSort(arr1, arr2) - assert(arr2.sameElements(Array(1, 2, 4, 8, 3, 6, 7, 5))) - } - - @Test def runTest(): Unit = { - val dim = 20 - val ts = Seq(new DenseTensor1(dim), new SparseTensor1(dim)) - val r = new Random - for (i <- 0 until 10) { - val index = math.abs(r.nextInt()) % dim - val value = r.nextDouble() - logger.debug("index="+index+" value="+value) - ts.foreach(_.+=(index, value)) - } - //println(ts.head.toSeq) - //println(ts.last.toSeq) - for (i <- 0 until 20) assertEquals(ts.head(i), ts.last(i), 0.001) - // assert(ts.head.toSeq == ts.last.toSeq) - } - - @Test def testOuter(): Unit = { - val t1 = new SparseIndexedTensor1(10) - val t2 = new SparseIndexedTensor1(10) - val t3 = new SparseIndexedTensor1(10) - - t1 += (1, 2.0) - t2 += (2, 1.0) - t3 += (3, 4.0) - - println(t1 outer t2) - // the outer product is an instance of Tensor2 - assert((t1 outer t2).isInstanceOf[Outer1Tensor2]) - - assert(((t1 outer t2 outer t3) dot (t1 outer t2 outer t3)) == 64) - assert(((t1 outer (t2 outer t3)) dot (t1 outer (t2 outer t3))) == 64) - - t1 += (1, 3.0) - - val t4 = new SingletonBinaryTensor1(10, 2) - - val res = new SparseIndexedTensor2(10, 10) + (t4 outer t1) - - assert(res(21) == 5.0, 0.0001) - } - - @Test def testBinary(): Unit = { - val foo = new SparseBinaryTensor1(100) - foo += (50, 1.0) - foo += (60, 0.0) - foo(70) = 0.0 - foo(71) = 1.0 - - assertDoubleEquals(foo(50), 1, 0.01) - assertDoubleEquals(foo(60), 0, 0.01) - assertDoubleEquals(foo(70), 0, 0.01) - assertDoubleEquals(foo(71), 1, 0.01) - - val bar = new SparseBinaryTensor2(10, 10) - bar(5, 5) = 0.0 - bar (6, 6) = 1.0 - - assertDoubleEquals(bar(4,4), 0, 0.01) - assertDoubleEquals(bar(5,5), 0, 0.01) - assertDoubleEquals(bar(6,6), 1, 0.01) - } - - trait 
TensorCreator { - def create(i: Int): Tensor - } - - def creators: Seq[TensorCreator] = Seq( - new TensorCreator { def create(i: Int) = new DenseTensor1(i) }, - new TensorCreator { def create(i: Int) = new SparseTensor1(i) }, - new TensorCreator { def create(i: Int) = new GrowableSparseIndexedTensor1(Iterable.fill(i)(0)) }, - new TensorCreator { def create(i: Int) = new SparseIndexedTensor2(i, 1) }, - new TensorCreator { def create(i: Int) = new DenseTensor2(1, i) }, - new TensorCreator { def create(i: Int) = new DenseLayeredTensor2(1, i, new DenseTensor1(_)) }, - new TensorCreator { def create(i: Int) = new DenseLayeredTensor2(1, i, new SparseTensor1(_)) }, - new TensorCreator { def create(i: Int) = new DenseTensor3(1, 1, i) }, - new TensorCreator { def create(i: Int) = new Dense2LayeredTensor3(1, 1, i, new DenseTensor1(_)) }, - new TensorCreator { def create(i: Int) = new Dense2LayeredTensor3(1, 1, i, new SparseTensor1(_)) } - // TODO: add all other tensor types above here - ) - - @Test def testZero() { - def fill(t: TensorCreator) = { - val tensor = t.create(100) - tensor(10) = 20 - tensor(1) = 2 - tensor(2) = -5 - tensor - } - testPairwise(fill) { (t1, t2) => - t1 += t2 - t1.zero() - assert(t1.forall(0.0.==), "Failed zero check at %s, %s" format (t1.getClass.getName, t2.getClass)) - val t3 = t1.blankCopy - t3 += t1 - t3 += t2 -// assert((0 until t2.size).forall(i => t2(i) == t3(i)), "Failed += after zero() at %s, %s" format (t1.getClass, t2.getClass)) - } - } - - def testPairwise(fill: TensorCreator => Tensor)(test: (Tensor, Tensor) => Unit): Unit = { - for (c1 <- creators; c2 <- creators) { - val t1 = fill(c1) - val t2 = fill(c2) - if (!(t1.isInstanceOf[Tensor2] && t2.isInstanceOf[Tensor3]) && - !(t1.isInstanceOf[Tensor3] && t2.isInstanceOf[Tensor2])) - test(t1, t2) - } - } - - @Test def testDot() { - val dim1 = 10; val dim2 = 1000 - val random = new scala.util.Random(0) - val dense = new DenseTensor2(dim1, dim2) - val sparse = new DenseLayeredTensor2(dim1, dim2, new SparseIndexedTensor1(_)) - for (i <- 0 until 1000) dense(random.nextInt(dim1*dim2)) = random.nextDouble() - sparse += dense - val features = new SparseBinaryTensor1(dim2) - for (i <- 0 until 20) features.+=(random.nextInt(dim2)) - val statistics = new SingletonLayeredTensor2(dim1, dim2, 3, 0.3, features) - assertEquals(dense dot statistics, sparse dot statistics, 0.0001) - - val s1 = new SparseIndexedTensor1(10) - s1 += (1, 5.0) - s1 += (6, 2.0) - s1 += (8, -3.0) - val s2 = new SparseIndexedTensor1(10) - s2 += (1, -1.0) - s2 += (5, 2.0) - s2 += (8, 5.0) - s2 += (9, 5.0) - - assertEquals(s1 dot s2, -20.0, 0.0001) - } - - @Test def testPlusEqual() { - - def fill(t: TensorCreator) = { - val tensor = t.create(100) - tensor(10) = 20 - tensor(1) = 2 - tensor(2) = -5 - tensor - } - - testPairwise(fill) { (t1, t2) => - - logger.debug("testing " + t1.getClass.getName + " and " + t2.getClass.getName) - assertEquals(20.0*20 + 2*2 + 5*5, t1 dot t2, 0.001) - t1 += (t2, 0.1) - assertEquals(t1(10), 22, 0.01) - - t1 *= 0.5 - assertEquals(11, t1(10), 0.001) - - try { - assertEquals(22, (t1*2)(10), 0.001) - } catch { - case e: Error => assert(e.getMessage.contains("Method copy not defined")) - } - } - } - - // DenseTensor stores elements in an full-size array - @Test def testDenseTensor3(): Unit = { - val tensor = new DenseTensor3(3, 3, 3) - - // ensure numDimensions is correct - assertEquals(tensor.numDimensions, 3) - assertEquals(tensor.length, 27) - - // convert array index to tensor index - assertEquals(tensor.singleIndex(0, 0, 0), 0) 
- assertEquals(tensor.singleIndex(0, 0, 1), 1) - assertEquals(tensor.singleIndex(0, 0, 2), 2) - assertEquals(tensor.singleIndex(0, 1, 0), 3) - assertEquals(tensor.singleIndex(0, 1, 1), 4) - - // element value can be assigned by index - tensor(1,1,1) = 1.0 - assertEquals(tensor(1,1,1), 1.0, 0.001) - - // element value can also be assigned by += - tensor += (2, 2, 2, 2.0) - assertEquals(tensor(2,2,2), 2.0, 0.001) - } -} diff --git a/src/test/scala/cc/factorie/la/TestTensor2.scala b/src/test/scala/cc/factorie/la/TestTensor2.scala deleted file mode 100644 index dc71d18..0000000 --- a/src/test/scala/cc/factorie/la/TestTensor2.scala +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.la - -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit._ - - -class TestTensor2 extends JUnitSuite with cc.factorie.util.FastLogging { - - val eps = 1e-4 - - @Test - def testDenseTensor2(): Unit = { - val t1 = new DenseTensor2(2,2) - - // initial value is 0 - assertEquals(0.0, t1(0,0), eps) - assertEquals(0.0, t1(0,1), eps) - assertEquals(0.0, t1(1,0), eps) - assertEquals(0.0, t1(1,1), eps) - - // construct an matrix - // | 0.2 0.4 | - // | 0.8 0.6 | - t1(0,0) = 0.2 - t1(0,1) = 0.4 - t1(1,0) = 0.8 - t1(1,1) = 0.6 - - val t1equal = new DenseTensor2(Array(Array(0.2, 0.4), Array(0.8, 0.6))) - assertArrayEquals(t1.toArray, t1equal.toArray, eps) - - assertEquals(0.2, t1(0,0), eps) - - val t2 = new DenseTensor2(2,2) - // construct an matrix - // | 0.1 0.3 | - // | 0.9 0.7 | - t2(0,0) = 0.1 - t2(0,1) = 0.3 - t2(1,0) = 0.9 - t2(1,1) = 0.7 - - val t3 = new DenseTensor1(2) - t3(0) = 0.1 - t3(1) = 0.9 - - // | 0.2 0.4 | * | 0.1 | = | 0.38 | - // | 0.8 0.6 | | 0.9 | | 0.62 | - val t4 = t1 * t3 - assertArrayEquals(Array(0.38, 0.62), t4.toArray, eps) - - // | 0.2 0.4 | leftMultiply | 0.1 | = | 0.1 0.9 | * | 0.2 0.4 | = | 0.74 | - // | 0.8 0.6 | | 0.9 | | 0.8 0.6 | | 0.58 | - val t5 = t1 leftMultiply t3 - assertArrayEquals(Array(0.74, 0.58), t5.toArray, eps) - - // println(t1 outer t3) - // not fully implemented, which will cause infinite looping - // t1 outer t2 - - } -} diff --git a/src/test/scala/cc/factorie/maths/TestGamma.scala b/src/test/scala/cc/factorie/maths/TestGamma.scala deleted file mode 100644 index 26178b4..0000000 --- a/src/test/scala/cc/factorie/maths/TestGamma.scala +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.maths - -import org.junit.Test - -class TestGamma { - @Test def runTest(): Unit = { - def factorial(n: Int): Int = if (n == 0) 1 else n * factorial(n - 1) - val xs = Seq[Double](1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) - for (x <- xs) { -// println(f"gamma($x%f) old: ${factorial(x.toInt - 1).toDouble}%f new: ${math.exp(logGamma(x))}%f") - assert(math.abs(factorial(x.toInt - 1).toDouble) - math.exp(logGamma(x)) < .01) - } - } -} \ No newline at end of file diff --git a/src/test/scala/cc/factorie/model/TestModel.scala b/src/test/scala/cc/factorie/model/TestModel.scala deleted file mode 100644 index 4bb1011..0000000 --- a/src/test/scala/cc/factorie/model/TestModel.scala +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.model - -import org.junit.Test -import org.scalatest.junit._ - - -class TestModel extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testItemizedModel() { - val m = new ItemizedModel() - assert(m.factors.size == 0) - } - -} diff --git a/src/test/scala/cc/factorie/model/TestProposalSamplers.scala b/src/test/scala/cc/factorie/model/TestProposalSamplers.scala deleted file mode 100644 index dcbb88c..0000000 --- a/src/test/scala/cc/factorie/model/TestProposalSamplers.scala +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.model - -/** - * @author sameer - */ - -import cc.factorie.infer.VariablesSettingsSampler -import cc.factorie.variable.{CategoricalDomain, LabeledCategoricalVariable} -import junit.framework.Assert._ -import junit.framework._ - -/** - * @author sameer - * @since Sep 5, 2011 - */ - -class TestProposalSamplers extends TestCase with cc.factorie.util.FastLogging { - - val numLabels: Int = 3 - - // a binary variable that takes values 0 or 1 - object LabelDomain extends CategoricalDomain[Int](0 until numLabels) - - class BinVar(i: Int) extends LabeledCategoricalVariable(i) { - def domain = LabelDomain - } - - import scala.language.existentials - private def newFactor2(n1: BinVar, n2: BinVar, scoreEqual: Double, scoreUnequal: Double) = - new TupleFactorWithStatistics2[BinVar, BinVar](n1, n2) { - factor => - def score(s1:BinVar#Value, s2:BinVar#Value): Double = if (s1 == s2) scoreEqual else scoreUnequal - override def equalityPrerequisite = this - } - - // short for exponential - private def e(num: Double) = math.exp(num) - - val eps = 1e-5 - - override protected def setUp() { - super.setUp() - // initialize binary variables with two values - new BinVar(0) - new BinVar(1) - } - - def testV2F1() = { - implicit val random = new scala.util.Random(0) - val samples = 10000 - val v1 = new BinVar(0) - val v2 = new BinVar(0) - val model = new ItemizedModel(newFactor2(v1, v2, 5, 1)) - val sampler = new VariablesSettingsSampler[BinVar](model) - - val origScore = model.currentScore(Seq(v1, v2)) - logger.debug("orig score: " + origScore) - val assignCounts = Array.fill(numLabels, numLabels)(0) - for (i <- 0 until samples) { - sampler.process(Seq(v1, v2)) - assignCounts(v1.intValue)(v2.intValue) += 1 - } - val totalCount = assignCounts.toSeq.foldLeft(0.0)((s, arr) => arr.toSeq.foldLeft(s)(_ + _)) - var Z = 0.0 - for (p <- sampler.proposals(Seq(v1, v2))) { - p.diff.redo() - val modelScore = model.currentScore(Seq(v1, v2)) - Z += e(modelScore) - p.diff.undo() - } - for (p <- sampler.proposals(Seq(v1, v2))) { - p.diff.redo() - val modelScore = model.currentScore(Seq(v1, v2)) - val sampleProb = assignCounts(v1.intValue)(v2.intValue) / totalCount - logger.debug("%d %d : true: %f, prop: %f, trueProb: %f, sample: %f".format(v1.intValue, v2.intValue, modelScore - origScore, p.modelScore, e(modelScore) / Z, sampleProb)) - assertEquals(modelScore - origScore, p.modelScore, eps) - assertEquals(e(modelScore) / Z, sampleProb, 0.01) - p.diff.undo() - } - } -} diff --git a/src/test/scala/cc/factorie/model/TestTemplates.scala b/src/test/scala/cc/factorie/model/TestTemplates.scala deleted file mode 100644 index b443c53..0000000 --- a/src/test/scala/cc/factorie/model/TestTemplates.scala +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.model - -import cc.factorie._ -import cc.factorie.variable.{BooleanVariable, DiffList, RealVariable, Var, _} -import org.junit.Test -import org.scalatest.junit._ - -/** - * @author sriedel - */ -class TestTemplates extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testFactorsOfDiffList() { - val template = new DotTemplateWithStatistics1[BooleanVariable] with Parameters { val weights = Weights(new la.DenseTensor1(BooleanDomain.size)) } - val b = new BooleanVariable(true) - val diff = new DiffList - b.set(false)(diff) - val factors = template.factors(diff) - assert(factors.head.asInstanceOf[DotTemplateWithStatistics1[BooleanVariable]#FactorType].family === template) - assert(factors.head.variables.head == b) - // TODO Uncomment this next line - //assert(factors.head.statistics.asInstanceOf[TemplateWithDotStatistics1[BooleanVariable]#Stat]._1 == false) - } - - @Test - def testCascadeUnroll() { - object Aggregate extends BooleanVariable { - val b1 = new BooleanVariable { - //override def unrollCascade: scala.Iterable[Var] = Seq(Aggregate) - } - } - val diff = new DiffList - val template = new DotTemplateWithStatistics1[BooleanVariable] with Parameters { - val weights = Weights(new la.DenseTensor1(BooleanDomain.size)) - override def unroll(v:Var) = v match { case Aggregate.b1 => Factor(Aggregate); case _ => Nil } - } - Aggregate.b1.set(true)(diff) - val factors = template.factors(diff) - assert(factors.exists(factor => factor.variables.head == Aggregate.b1)) - assert(factors.exists(factor => factor.variables.head == Aggregate)) - } - - @Test - def testVarArgs() { - class Aggregate extends BooleanVariable { - class Member extends BooleanVariable { - def owner = Aggregate.this - } - val members = for (i <- 0 until 10) yield new Member - } - val aggregate = new Aggregate - val template = new DotTemplate2[Aggregate,Vars[Aggregate#Member]] with Parameters { - val weights = Weights(new la.DenseTensor1(1)) - def unroll2(v: Vars[Aggregate#Member]) = sys.error("Not needed") - def unroll1(v: Aggregate) = Factor(v,Vars(v.members)) - //override def unroll2s(v: Aggregate#Member) = Factor(v.owner,Vars(v.owner.members)) - override def unroll(v:Var) = v match { case v:Aggregate#Member => Factor(v.owner, Vars(v.owner.members)); case _ => Nil } - override def statistics(v1:Aggregate#Value, v2:Vars[Aggregate#Member]#Value) = - new RealVariable(v2.count(_.booleanValue)).value // TODO Just create a RealValue; don't bother with a RealVariable - } - val diff = new DiffList - aggregate.members(0).set(true)(diff) - aggregate.members(2).set(true)(diff) - val factors = template.factors(diff).toSeq - assert(factors.size === 1) - assert(factors(0).variables(0) === aggregate) - assert(factors(0).variables(1) === Vars(aggregate.members)) - - - } -} - -//class SettingIteratorTests extends TestCase { -// val v1 = new BooleanVariable(true) -// val v2 = new BooleanVariable(true) -// val v3 = new BooleanVariable(true) -// -// //TODO: test fixed assignments -// -// def testLimitedSettingsIterator1 { -// val template = new TemplateWithDotStatistics1[BooleanVariable] { def statisticsDomains = Tuple1(BooleanDomain) } -// val factor = template.unroll1(v1).head -// assert(factor.valuesIterator(Set(factor._1.asInstanceOf[Variable])).size == 2) -// logger.debug("Template1 valuesIterator:") -// factor.valuesIterator(Set(factor._1.asInstanceOf[Variable])).foreach(logger.debug(_)) -// logger.debug("--------------------------------") -// -// 
template.addLimitedDiscreteValues(Seq(BooleanDomain.head.intValue)) -// template.isLimitingValuesIterator = true -// assert(factor.valuesIterator(Set(factor._1.asInstanceOf[Variable])).size == 1) -// logger.debug("Template1 limitedValuesIterator:") -// factor.valuesIterator(Set(factor._1.asInstanceOf[Variable])).foreach(logger.debug(_)) -// logger.debug("--------------------------------") -// } -// -// def testLimitedSettingsIterator2 { -// val template = new TemplateWithDotStatistics2[BooleanVariable, BooleanVariable] { -// def statisticsDomains = ((BooleanDomain, BooleanDomain)) -// def unroll1(v: BooleanVariable) = Factor(v1, v2) -// def unroll2(v: BooleanVariable) = sys.error("Not needed") -// } -// -// val factor = template.unroll1(v1).head -// assert(factor.valuesIterator(factor.variables.toSet).size == 4) -// logger.debug("Template2 valuesIterator:") -// factor.valuesIterator(factor.variables.toSet).foreach(logger.debug(_)) -// logger.debug("--------------------------------") -// -// template.addLimitedDiscreteValues(Seq((0,0),(1,1))) -// template.isLimitingValuesIterator = true -// -// assert(factor.valuesIterator(factor.variables.toSet).size == 2) -// logger.debug("Template2 limitedValuesIterator:") -// factor.valuesIterator(factor.variables.toSet).foreach(logger.debug(_)) -// logger.debug("--------------------------------") -// } -// -// def testLimitedSettingsIterator3 { -// val template = new TemplateWithDotStatistics3[BooleanVariable, BooleanVariable, BooleanVariable] { -// def statisticsDomains = ((BooleanDomain, BooleanDomain, BooleanDomain)) -// def unroll1(v: BooleanVariable) = Factor(v1, v2, v3) -// def unroll2(v: BooleanVariable) = sys.error("Not needed") -// def unroll3(v: BooleanVariable) = sys.error("Not needed") -// } -// -// var factor = template.unroll1(v1).head -// logger.debug("Template3 valuesIterator:") -// factor.valuesIterator(factor.variables.toSet).foreach(logger.debug(_)) -// assert(factor.valuesIterator(factor.variables.toSet).size == 8) -// logger.debug("--------------------------------") -// -// template.addLimitedDiscreteValues(Seq((0,0,0),(1,1,1))) -// template.isLimitingValuesIterator = true -// -// logger.debug("limiting factor? : " + factor.isLimitingValuesIterator) -// logger.debug("Template3 limitedValuesIterator:") -// factor.valuesIterator(factor.variables.toSet).foreach(logger.debug(_)) -// assert(factor.valuesIterator(factor.variables.toSet).size == 2) -// logger.debug("--------------------------------") -// } -// -//} diff --git a/src/test/scala/cc/factorie/optimize/TestDecisionTree.scala b/src/test/scala/cc/factorie/optimize/TestDecisionTree.scala deleted file mode 100644 index e14f7b3..0000000 --- a/src/test/scala/cc/factorie/optimize/TestDecisionTree.scala +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.optimize - -import cc.factorie.app.classify.backend._ -import cc.factorie.la.{SingletonTensor1, SparseIndexedTensor1, Tensor1} -import cc.factorie.traversableExtras -import cc.factorie.util.BinarySerializer -import cc.factorie.util.CubbieConversions._ -import cc.factorie.variable.{DiscreteDomain, VectorDomain} -import org.junit.Test -import org.scalatest.junit.JUnitSuite - -class TestDecisionTree extends JUnitSuite { - @Test def testRegression(): Unit = { - implicit val random = new scala.util.Random(0) - val xs = Seq[Double](1, 2, 6, 8, 34, 45) - val ys = xs.map(x => x * x) - xs.zip(ys).foreach({case (x, y) => println(f"x: $x%f y: $y%f")}) - val instances = xs.zip(ys).map({ - case (x, y) => DecisionTreeTrainer.Instance(new SingletonTensor1(1, 0, x), new SingletonTensor1(1, 0, y), 1.0) - }) - val trainer = new RegressionTreeTrainer { minSampleSize = 1 } - val tree = trainer.train(instances) - val xs2 = Seq[Double](5, 7, 8, 23, 50) - val preds = xs2.map(x => DTree.score(new SingletonTensor1(1, 0, x), tree)(0)) - xs2.zip(preds).foreach({ - case (x, y) => println(f"x: $x%f y: $y%f") - }) - assert(preds.sameElements(Seq[Double](36, 36, 64, 1156, 2025)), "Dec tree regression didn't work!") - } - @Test def testRegression2(): Unit = { - // separate test to split real variables with negative values - implicit val random = new scala.util.Random(0) - val xs = Seq[Double](-10, -5, 1, 2, 6, 8) - val ys = xs.map(x => x * x * x) - xs.zip(ys).foreach({case (x, y) => println(f"x: $x%f y: $y%f")}) - val instances = xs.zip(ys).map({ - case (x, y) => DecisionTreeTrainer.Instance(new SingletonTensor1(1, 0, x), new SingletonTensor1(1, 0, y), 1.0) - }) - val trainer = new RegressionTreeTrainer { minSampleSize = 1 } - val tree = trainer.train(instances) - val xs2 = Seq[Double](-8, 5, 7, 9) - val preds = xs2.map(x => DTree.score(new SingletonTensor1(1, 0, x), tree)(0)) - xs2.zip(preds).foreach({ - case (x, y) => println(f"x: $x%f y: $y%f") - }) - assert(preds.sameElements(Seq[Double](-1000, 216, 216, 512)), "Dec tree regression didn't work!") - } - @Test def testClassification(): Unit = { - implicit val random = new scala.util.Random(0) - object featuresDomain extends VectorDomain { - val dimensionDomain = new DiscreteDomain(100) - } - object labelDomain extends DiscreteDomain(2) - val mean1 = (0 until 100).map(_ => random.nextDouble()).toSeq - val mean2 = (0 until 100).map(_ => random.nextDouble()).toSeq - val positiveExampleSeqs = (0 until 100).map(_ => (0 until 10).map(_ => mean1.zipWithIndex.sampleProportionally(_._1)._2)) - val negativeExampleSeqs = (0 until 100).map(_ => (0 until 10).map(_ => mean2.zipWithIndex.sampleProportionally(_._1)._2)) - val posExampleTensors = positiveExampleSeqs.map(pos => { - val t = new SparseIndexedTensor1(100) - pos.foreach(p => t += (p, 1.0)) - t - }) - val negExampleTensors = negativeExampleSeqs.map(neg => { - val t = new SparseIndexedTensor1(100) - neg.foreach(p => t += (p, 1.0)) - t - }) - - // add truth feature - if this doesn't help, we have a bug - // TODO add some tests for feature splitting -// posExampleTensors.foreach(t => t += (100, 1.0)) - val (trainSet, testSet) = (posExampleTensors.map(p => (p, 1)) ++ negExampleTensors.map(n => (n, 0))).shuffle.split(0.5) - val trainers = Seq( - new BoostingMulticlassTrainer(100), - new OnlineLinearMulticlassTrainer, - new RandomForestMulticlassTrainer(100, 100, 100), - new DecisionTreeMulticlassTrainer(new C45DecisionTreeTrainer)) - - val trainFeatures = trainSet.map(_._1) - val trainLabels = 
trainSet.map(_._2) - val testFeatures = testSet.map(_._1) - val testLabels = testSet.map(_._2) - def calcAccuracy(c: MulticlassClassifier[Tensor1]): Double = - testFeatures.map(i => c.classification(i).bestLabelIndex) - .zip(testLabels).count(i => i._1 == i._2).toDouble/testLabels.length - val evaluate = (c: MulticlassClassifier[Tensor1]) => { - val accuracy = calcAccuracy(c) - println(f"Test accuracy: $accuracy%1.4f") - assert(accuracy > 0.66) - } - val evaluate2 = (c1: MulticlassClassifier[Tensor1], c2: MulticlassClassifier[Tensor1]) => { - val accuracy1 = calcAccuracy(c1) - val accuracy2 = calcAccuracy(c2) - println(f"Test accuracy: $accuracy1%1.4f") - assert(accuracy1 > 0.66 && accuracy1 == accuracy2) - } - - for (trainer <- trainers) - trainer.simpleTrain(2, 100, trainLabels, trainFeatures, trainSet.map(_ => 1.0), evaluate) - - // confirm i can serialize dec trees - val file = java.io.File.createTempFile("FactorieTestFile", "serialize-randomforest").getAbsolutePath - - println("Testing deserialized:") - - val rfc = new RandomForestMulticlassTrainer(100, 100, 100).simpleTrain(2, 100, trainLabels, trainFeatures, trainSet.map(_ => 1.0), _ => {}) - BinarySerializer.serialize(rfc, file) - val rfc2 = BinarySerializer.deserialize[RandomForestMulticlassClassifier](file) - - evaluate2(rfc, rfc2) - - val bc = new BoostingMulticlassTrainer(100).simpleTrain(2, 100, trainLabels, trainFeatures, trainSet.map(_ => 1.0), _ => {}) - BinarySerializer.serialize(bc, file) - val bc2 = BinarySerializer.deserialize[BoostedMulticlassClassifier](file) - - evaluate2(bc, bc2) - } -} diff --git a/src/test/scala/cc/factorie/optimize/TestLearning.scala b/src/test/scala/cc/factorie/optimize/TestLearning.scala deleted file mode 100644 index ba6f481..0000000 --- a/src/test/scala/cc/factorie/optimize/TestLearning.scala +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.optimize - -import cc.factorie._ -import cc.factorie.infer.InferByBPTree -import cc.factorie.la._ -import cc.factorie.model.{DotTemplateWithStatistics1, DotTemplateWithStatistics2, Parameters, TemplateModel} -import cc.factorie.util.LocalDoubleAccumulator -import cc.factorie.variable.{BinaryFeatureVectorVariable, CategoricalDomain, CategoricalVectorDomain, LabeledCategoricalVariable} -import org.junit.Assert._ -import org.junit.Test - -import scala.util.Random - -/** - * @author sameer - */ -class TestLearning { - - val random = new Random(0) - - object LabelDomain extends CategoricalDomain[String] - - object FeatureDomain extends CategoricalVectorDomain[String] - - class Features(val label: Label) extends BinaryFeatureVectorVariable[String] { - def domain = FeatureDomain - } - - class Label(val id: Int, labelStr: String) extends LabeledCategoricalVariable[String](labelStr) { - - def domain = LabelDomain - - val features = new Features(this) - } - - def createData(n: Int): Seq[Label] = { - (0 until n) map (i => { - val l = new Label(i, (i < n/2).toString) - l.features += ((l.intValue + 1) * (i % 5)).toString - l - }) - } - - def createModel(): TemplateModel with Parameters = - new TemplateModel with Parameters { - this += new DotTemplateWithStatistics1[Label] { - val weights = Weights(new DenseTensor1(LabelDomain.size)) - - for (i <- 0 until LabelDomain.size) - weights.value(i) = random.nextDouble - 0.5 - - override def toString = "bias" - } - this += new DotTemplateWithStatistics2[Label, Features] { - val weights = Weights(new DenseTensor2(LabelDomain.size, FeatureDomain.dimensionSize)) - - for (i <- 0 until LabelDomain.size) - for (j <- 0 until FeatureDomain.dimensionSize) - weights.value(i, j) = random.nextDouble - 0.5 - - def unroll1(l: Label) = Factor(l, l.features) - - def unroll2(f: Features) = Factor(f.label, f) - - override def toString = "obs" - } - } - - @Test - def testPseudolikelihood() { - val data = createData(10) - val model = createModel() - - val plExamples = data.map(d => new PseudolikelihoodExample(Seq(d), model)) - val plgrad = new LocalWeightsMapAccumulator(model.parameters.blankDenseMap) - val plvalue = new LocalDoubleAccumulator(0.0) - - val llExamples = data.map(d => new LikelihoodExample(Seq(d), model, InferByBPTree)) - val llgrad = new LocalWeightsMapAccumulator(model.parameters.blankDenseMap) - val llvalue = new LocalDoubleAccumulator(0.0) - - for ((ple, lle) <- plExamples.zip(llExamples)) { - val localPLgrad = new LocalWeightsMapAccumulator(model.parameters.blankDenseMap) - val localPLvalue = new LocalDoubleAccumulator(0.0) - ple.accumulateValueAndGradient(localPLvalue, localPLgrad) - ple.accumulateValueAndGradient(plvalue, plgrad) - - val localLLgrad = new LocalWeightsMapAccumulator(model.parameters.blankDenseMap) - val localLLvalue = new LocalDoubleAccumulator(0.0) - lle.accumulateValueAndGradient(localLLvalue, localLLgrad) - lle.accumulateValueAndGradient(llvalue, llgrad) - - // check local - assertEquals("local value does not match", localPLvalue.value, localLLvalue.value, 1.0e-7) - assertEquals("local tensors size does not match", localPLgrad.tensorSet.toSeq.size, localLLgrad.tensorSet.toSeq.size) - for ((a, llt) <- localLLgrad.tensorSet.toSeq) { - val plt = localPLgrad.tensorSet(a) - assertEquals("local tensor size for " + a + " does not match", plt.size, llt.size) - for (i <- 0 until llt.size) { - assertEquals(plt(i), llt(i), 1.0e-7) - } - } - } - // check global - assertEquals("global value does not match", plvalue.value, 
llvalue.value, 1.0e-7) - assertEquals("global tensors size does not match", plgrad.tensorSet.toSeq.size, llgrad.tensorSet.toSeq.size) - for ((a, llt) <- llgrad.tensorSet.toSeq) { - val plt = plgrad.tensorSet(a) - assertEquals("global tensor size for " + a + " does not match", plt.size, llt.size) - for (i <- 0 until llt.size) { - assertEquals("global tensor value for " + a + "(" + i + ") does not match", plt(i), llt(i), 1.0e-7) - } - } - - } - -} diff --git a/src/test/scala/cc/factorie/optimize/TestOptimize.scala b/src/test/scala/cc/factorie/optimize/TestOptimize.scala deleted file mode 100644 index 54fa9db..0000000 --- a/src/test/scala/cc/factorie/optimize/TestOptimize.scala +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -//package cc.factorie.optimize -// -//import junit.framework._ -//import Assert._ -//import org.scalatest.junit.JUnitSuite -//import cc.factorie.maths.ArrayOps -//import cc.factorie.la._ -//import util.Random -//import cc.factorie.optimize._ -// -///** -// * Created by IntelliJ IDEA. -// * User: gdruck -// * Date: Sep 30, 2010 -// * Time: 2:02:12 PM -// * To change this template use File | Settings | File Templates. 
-// */ -// -//class TestOptimize extends TestCase { -// -// def testLineOptimizer = { -// var function = new SimpleFunction() -// var optimizer = new BackTrackLineOptimizer(function) -// // with a step of 1, this should jump over the -// // maximum and need to backtrack -// val gradient = new Array[Double](1) -// function.getOptimizableGradient(gradient) -// optimizer.optimize(gradient,1) -// println(function.optimizableParameter(0)) -// assertEquals(function.optimizableParameter(0),2.5,1e-6) -// } -// -// def testLMBFGS = { -// -// val rand = new Random(1) -// -// for (i <- 0 until 10) { -// val a = rand.nextInt(10) + 1.0 -// val b = rand.nextInt(10) + 1.0 -// val c = rand.nextInt(10) + 1.0 -// val d = rand.nextInt(10) + 1.0 -// -// var function = new BivariateQuadraticFunction(a,b,c,d) -// // this value is far enough away from the max that -// // optimizing will require at least two iterations -// var optimizer = new LimitedMemoryBFGS(function) -// optimizer.tolerance = 1e-8 -// optimizer.gradientTolerance = 1e-8 -// optimizer.optimize() -// -// //println(function.optimizableParameter(0) + " " + c/(2*a)) -// //println(function.optimizableParameter(1) + " " + d/(2*b)) -// -// assertEquals(function.optimizableParameter(0),c/(2*a),1e-4) -// assertEquals(function.optimizableParameter(1),d/(2*b),1e-4) -// } -// } -// -// def testGradientAscent = { -// -// val rand = new Random(1) -// -// for (i <- 0 until 10) { -// val a = rand.nextInt(10) + 1.0 -// val b = rand.nextInt(10) + 1.0 -// val c = rand.nextInt(10) + 1.0 -// val d = rand.nextInt(10) + 1.0 -// -// var function = new BivariateQuadraticFunction(a,b,c,d) -// // this value is far enough away from the max that -// // optimizing will require at least two iterations -// var optimizer = new GradientAscent(function) -// -// // reduce all tolerances to make sure -// // we do not stop until actually at the -// // maximum -// optimizer.tolerance = 1e-12 -// optimizer.gradientTolerance = 1e-8 -// optimizer.lineOptimizer.absTolx = 1e-8 -// optimizer.lineOptimizer.relTolx = 1e-8 -// optimizer.optimize() -// -// //println(function.optimizableParameter(0) + " " + c/(2*a)) -// //println(function.optimizableParameter(1) + " " + d/(2*b)) -// -// assertEquals(function.optimizableParameter(0),c/(2*a),1e-4) -// assertEquals(function.optimizableParameter(1),d/(2*b),1e-4) -// } -// } -// -// def testConjugateGradient = { -// -// val rand = new Random(1) -// -// for (i <- 0 until 10) { -// val a = rand.nextInt(10) + 1.0 -// val b = rand.nextInt(10) + 1.0 -// val c = rand.nextInt(10) + 1.0 -// val d = rand.nextInt(10) + 1.0 -// -// var function = new BivariateQuadraticFunction(a,b,c,d) -// // this value is far enough away from the max that -// // optimizing will require at least two iterations -// var optimizer = new ConjugateGradient(function) -// optimizer.tolerance = 1e-12 -// optimizer.gradientTolerance = 1e-8 -// optimizer.lineOptimizer.absTolx = 1e-8 -// optimizer.lineOptimizer.relTolx = 1e-8 -// try { -// optimizer.optimize() -// } -// catch { -// case e:Exception => e.printStackTrace() -// } -// -// //println(function.optimizableParameter(0) + " " + c/(2*a)) -// //println(function.optimizableParameter(1) + " " + d/(2*b)) -// -// assertEquals(function.optimizableParameter(0),c/(2*a),1e-4) -// assertEquals(function.optimizableParameter(1),d/(2*b),1e-4) -// } -// } -//} -// -//class BivariateQuadraticFunction(var a : Double, var b : Double, var c : Double, var d : Double) -// extends OptimizableByValueAndGradient { -// var x = new Array[Double](2) -// -// 
def numOptimizableParameters : Int = 2 -// -// def getOptimizableParameters(a:Array[Double]) = { -// assertTrue(a.length == 2) -// Array.copy(x, 0, a, 0, x.length) -// } -// -// def setOptimizableParameters(a:Array[Double]) = { -// assertTrue(a.length == 2) -// Array.copy(a, 0, x, 0, a.length) -// } -// -// def optimizableParameter(index:Int): Double = { -// assertTrue(index < 2) -// x(index) -// } -// -// def optimizableParameter_=(index:Int, d:Double): Unit ={ -// assertTrue(index < 2); -// x(index) = d -// } -// -// def optimizableValue : Double = { -// - a * x(0) * x(0) - b * x(1) * x(1) + c * x(0) + d * x(1) -// } -// -// def getOptimizableGradient(gradient:Array[Double]) = { -// assertTrue(gradient.length == 2) -// gradient(0) = -2 * a * x(0) + c -// gradient(1) = - 2 * b * x(1) + d -// } -// -//} -// -//class SimpleFunction extends OptimizableByValueAndGradient { -// var x : Double = 0.0 -// -// def numOptimizableParameters : Int = 1 -// -// def getOptimizableParameters(a:Array[Double]) = { -// assertTrue(a.length == 1) -// a(0) = x -// a -// } -// -// def setOptimizableParameters(a:Array[Double]) = { -// assertTrue(a.length == 1) -// x = a(0) -// } -// -// def optimizableParameter(index:Int): Double = { -// assertTrue(index == 0) -// x -// } -// -// def optimizableParameter_=(index:Int, d:Double): Unit ={ -// assertTrue(index == 0); -// x = d -// } -// -// def optimizableValue : Double = { -// -x*x + 5 * x -// } -// -// def getOptimizableGradient(a:Array[Double]) = { -// a(0) = -2 * x + 5 -// a -// } -//} -// -//object TestOptimizeRunner { -// def suite: TestSuite = { -// val suite = new TestSuite -// suite.addTestSuite(classOf[TestOptimize]) -// suite -// } -// -// def main(args: Array[String]) { -// junit.textui.TestRunner.run(suite) -// } -//} \ No newline at end of file diff --git a/src/test/scala/cc/factorie/optimize/TestSampleRank.scala b/src/test/scala/cc/factorie/optimize/TestSampleRank.scala deleted file mode 100644 index 7d2294d..0000000 --- a/src/test/scala/cc/factorie/optimize/TestSampleRank.scala +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.optimize - -import cc.factorie._ -import cc.factorie.infer.GibbsSampler -import cc.factorie.la.{DenseTensor2, DenseTensor3, DenseTensor4, _} -import cc.factorie.model.DotTemplateWithStatistics2 -import cc.factorie.variable._ - -object TestSampleRank extends cc.factorie.util.FastLogging{ - object LabelDomain extends CategoricalDomain[String] - class Label(s:String, val instance:Instance) extends LabeledCategoricalVariable(s) { def domain = LabelDomain } - object InstanceDomain extends CategoricalVectorDomain[String] - class Instance(labelString:String) extends BinaryFeatureVectorVariable[String] { - def domain = InstanceDomain - val label = new Label(labelString, this) - // Add features that coorespond to label exactly - logger.debug("New Instance with Label "+labelString) - this += "f1"+labelString; logger.debug("TestSampleRank features "+value+" intArray "+value.asInstanceOf[SparseBinaryTensorLike1].toIntArray.toSeq) - this += "f2"+labelString; logger.debug("TestSampleRank features "+value+" intArray "+value.asInstanceOf[SparseBinaryTensorLike1].toIntArray.toSeq) - this += "f3"+labelString; logger.debug("TestSampleRank features "+value+" intArray "+value.asInstanceOf[SparseBinaryTensorLike1].toIntArray.toSeq) - } - val model = new DotTemplateWithStatistics2[Label,Instance] with Parameters { - val weights = Weights(new la.DenseTensor2(LabelDomain.size, InstanceDomain.dimensionSize)) - def unroll1(l:Label) = Factor(l, l.instance) - def unroll2(i:Instance) = Factor(i.label, i) - } - - def main(args:Array[String]): Unit = { - implicit val random = new scala.util.Random(0) - // Test Tensor index arithmetic - for (trials <- 1 to 100) { - val dim1 = random.nextInt(20)+1 - val dim2 = random.nextInt(20)+1 - val dim3 = random.nextInt(20)+1 - val dim4 = random.nextInt(20)+1 - logger.debug(List(dim1, dim2, dim3, dim4)) - var v = 0.0; var rdim1 = 0; var rdim2 = 0; var rdim3 = 0; var rdim4 = 0 - - val t2 = new DenseTensor2(dim1,dim2) - v = 0.0; for (i <- 0 until dim1; j <- 0 until dim2) { t2(i, j) = v; v += 1.0 } - v = 0.0; for (i <- 0 until dim1*dim2) { assert(t2(i) == v, "dim1="+dim1+" dim2="+dim2+" i="+i+" v="+v+" t(i)="+t2(i)+"\n"+t2); v += 1.0 } - rdim1 = random.nextInt(dim1) - rdim2 = random.nextInt(dim2) - val t2i = t2.singleIndex(rdim1, rdim2) - assert(t2.multiIndex(t2i) == (rdim1, rdim2)) - - val t3 = new DenseTensor3(dim1,dim2,dim3) - v = 0.0; for (i <- 0 until dim1; j <- 0 until dim2; k <- 0 until dim3) { t3(i, j, k) = v; v += 1.0 } - v = 0.0; for (i <- 0 until dim1*dim2*dim3) { assert(t3(i) == v); v += 1.0 } - rdim1 = random.nextInt(dim1) - rdim2 = random.nextInt(dim2) - rdim3 = random.nextInt(dim3) - val t3i = t3.singleIndex(rdim1, rdim2, rdim3) - assert(t3.multiIndex(t3i) == (rdim1, rdim2, rdim3)) - - val t4 = new DenseTensor4(dim1,dim2,dim3,dim4) - v = 0.0; for (i <- 0 until dim1; j <- 0 until dim2; k <- 0 until dim3; l <- 0 until dim4) { t4(i, j, k, l) = v; v += 1.0 } - v = 0.0; for (i <- 0 until dim1*dim2*dim3*dim4) { assert(t4(i) == v); v += 1.0 } - rdim1 = random.nextInt(dim1) - rdim2 = random.nextInt(dim2) - rdim3 = random.nextInt(dim3) - rdim4 = random.nextInt(dim4) - val t4i = t4.singleIndex(rdim1, rdim2, rdim3, rdim4) - assert(t4.multiIndex(t4i) == (rdim1, rdim2, rdim3, rdim4)) - } - - val labels = List("n", "y").map(s => new Instance(s)).map(_.label) - logger.debug("feature domain: "+InstanceDomain.dimensionDomain.mkString(" ")) - logger.debug("feature tensors:\n"+labels.map(l => l.instance.value.toString+"\n")) - val learner = new 
optimize.SampleRankTrainer(new GibbsSampler(model, HammingObjective), new cc.factorie.optimize.ConstantLearningRate) - //learner.logLevel = 10 - learner.processContexts(labels) - labels.foreach(l => l.set(0)(null)); logger.debug("Set to 0") - labels.foreach(l => logger.debug("feature="+l.instance.value+" value="+l.categoryValue+" target="+l.target.categoryValue+" score="+model.currentScore(l))) - labels.foreach(l => l.set(1)(null)); logger.debug("Set to 1") - labels.foreach(l => logger.debug("feature="+l.instance.value+" value="+l.categoryValue+" target="+l.target.categoryValue+" score="+model.currentScore(l))) - MaximizeDiscrete(labels, model); logger.debug("Set to max") - labels.foreach(l => logger.debug("feature="+l.instance.value+" value="+l.categoryValue+" target="+l.target.categoryValue+" score="+model.currentScore(l))) - logger.debug("Train accuracy "+labels.map(l => HammingObjective.currentScore(l)).sum / labels.length) - } -} diff --git a/src/test/scala/cc/factorie/optimize/TestSampleRank2.scala b/src/test/scala/cc/factorie/optimize/TestSampleRank2.scala deleted file mode 100644 index 0316798..0000000 --- a/src/test/scala/cc/factorie/optimize/TestSampleRank2.scala +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.optimize - -import cc.factorie._ -import cc.factorie.infer.{InferByBPLoopy, MHSampler} -import cc.factorie.model._ -import cc.factorie.variable._ -import org.junit.{Before, Test} -import org.scalatest.junit.{AssertionsForJUnit, JUnitSuite} - -import scala.collection.mutable.ArrayBuffer - - -/** -Learning package tests needs to verify: -SampleRank requires --transition function over states [MH/Gibbs/Local/SettingsSampler] --training signal [problem specific] --update rule [MIRA,CW,PA etc] - - -(1) Show how to integrate various update rules with SampleRank/CD -[Learning Method] X [Update Rule] X [Parameter Averaging] X [Sampler] --SampleRank -GA -YES -MH --CD -MIRA -NO -Settings --Perceptron -CW --RL -ARROW --2nd order (2) Show examples of how to specify signals/objectives --Penalty/Reward --F1/Accuracy - -(3) Show examples of specifying proposal distribution - - -DOES MIRA/CW work with manually specified weightsSet? - - ** TEST SAMPLERANK** -test if model ranking agrees with training signal (randomly created?) 
- - - - */ - -class TestRealVariable extends JUnitSuite with cc.factorie.util.FastLogging { - @Test def testRealVariableWorks() { - implicit val random = new scala.util.Random(0) - class Prob(val scoreVal:Double) extends RealVariable(scoreVal) - class Data(val scoreVal: Double, val truth: Boolean) extends LabeledBooleanVariable(truth) { - val score=new Prob(scoreVal) - } - val trainings=new ArrayBuffer[Data] - trainings+=new Data(0.1, false) - trainings+=new Data(0.4, false) - trainings+=new Data(0.6, true) - trainings+=new Data(0.8, true) - class SimpleTemplate(model: Parameters) extends DotTemplateWithStatistics2[Data, Prob]{ - val weights = model.Weights(new la.DenseTensor2(BooleanDomain.dimensionSize, RealDomain.dimensionSize)) - def unroll1(data: Data) = Factor(data, data.score) - def unroll2(prob: Prob) = Nil - } - val model = new TemplateModel with Parameters { addTemplates(new SimpleTemplate(this)) } - val objective = new HammingTemplate[Data, Data#TargetType] - - val pieces = new ArrayBuffer[LikelihoodExample[Iterable[DiscreteVar],Model]] - pieces += new LikelihoodExample(trainings.toIterable, model, InferByBPLoopy) - Trainer.batchTrain(model.parameters, pieces, evaluate = () => { - logger.debug("Accuracy after sampling: " + objective.accuracy(trainings)) - }) - } -} - - -//extends JUnitSuite with TestUtils{ -class TestSampleRank2 extends AssertionsForJUnit with cc.factorie.util.FastLogging { - val numVariables = 4 - val numProposals = 1000 - - class MyBool extends BooleanVariable(false) - { - var next: MyBool = null - var prev: MyBool = null - } - // - //create variables with a ring graph structure - var bools: Seq[MyBool] = null - - val trainingSignal = new CombinedModel ( - // - //this template unrolls a "ring" structured graphical model - new TupleTemplateWithStatistics2[MyBool, MyBool] { - def unroll1(b: MyBool) = Factor(b, b.next) - def unroll2(b: MyBool) = Factor(b.prev, b) - def score(v1:MyBool#Value, v2:MyBool#Value): Double = { - //var v1 = s._1 - //var v2 = s._2 - if (v1 == v2) -1.0 - else 1.0 - } - } - ) - var model: TemplateModel = null - - - class AllPairsProposer(model: CombinedModel) extends MHSampler[Null](model)(new scala.util.Random(0)) - { - def propose(context: Null)(implicit delta: DiffList): Double = - { - for (b <- bools) b.set(random.nextBoolean())(delta) - 0.0 - } - } - -// -// abstract class AllPairsCD1Proposer(model: CombinedModel) extends ContrastiveDivergence[Null](model) -// { -// def propose(context: Null)(implicit delta: DiffList): Double = -// { -// for (b <- bools) b.set(random.nextBoolean)(delta) -// 0.0 -// } -// } - - @Before def initialize() = - { - logger.debug("TESTING LEARNING FRAMEWORK") - bools = (for (i <- 0 until numVariables) yield new MyBool).toSeq - for (i <- 0 until bools.length - 1) - { - bools(i).next = bools(i + 1) - bools(i + 1).prev = bools(i) - } - bools(0).prev = bools(bools.length - 1) - bools(bools.length - 1).next = bools(0) - logger.debug("NUM BOOL VARS: " + bools.size) - - model = new TemplateModel with Parameters { - this += new DotTemplateWithStatistics2[MyBool, MyBool] { - //def statisticsDomains = ((BooleanDomain, BooleanDomain)) - val weights = Weights(new la.DenseTensor2(BooleanDomain.size, BooleanDomain.size)) - def unroll1(b: MyBool) = Factor(b, b.next) - def unroll2(b: MyBool) = Factor(b.prev, b) - } - } - } - - - def decodeConfiguration(v: Int, configuration: Seq[MyBool]): Unit = - { - val result = new ArrayBuffer[Boolean] - var tmpV = v - while (tmpV != 0) - { - result += (tmpV % 2 == 1) - tmpV /= 2 - } - 
//pad - for (i <- 0 until configuration.length - result.length) - result += false - - for (i <- 0 until configuration.length) - configuration(i).set(result(i))(null) - } - - def checkAllPairs() = - { - // - //Test extremes - var fpErrors = 0 - var fnErrors = 0 - for (i <- 0 until math.pow(2, bools.length).toInt) - { - decodeConfiguration(i, bools) - val modelScoreI = model.currentScore(bools) - val truthScoreI = trainingSignal.currentScore(bools) - - for (j <- i + 1 until math.pow(2, bools.length).toInt) - { - decodeConfiguration(j, bools) - val modelScoreJ = model.currentScore(bools) - val truthScoreJ = trainingSignal.currentScore(bools) - - if (truthScoreI > truthScoreJ) - if (modelScoreI <= modelScoreJ) - fpErrors += 1 - //assert(modelScoreI>modelScoreJ) - if (truthScoreI < truthScoreJ) - if (modelScoreI >= modelScoreJ) - fnErrors += 1 - //assert(modelScoreI maxScore) { - maxScore = modelScore - maxConfig = i - } -} - decodeConfiguration(maxConfig, bools) - maxConfig - } - -def checkAllAgainstTruth = - { - assignMAP(bools,trainingSignal) - val mapScore = model.scoreAll(bools) - var errors = 0 - for(i<-0 until Math.pow(2,bools.length).toInt) -{ - decodeConfiguration(i,bools) - val configScore = model.scoreAll(bools) - if(configScore>mapScore) - errors +=1 -} - logger.debug("NUM CD ERRORS: " + errors) - assert(errors==0) - } - -@Test def verifyCD1 = - { - val trainer = new AllPairsCD1Proposer(model) with GradientAscentUpdates - val mapConfig = assignMAP(bools,trainingSignal) - for(i<-0 until numProposals) -{ - // - //initialize to MAP - decodeConfiguration(mapConfig,bools) - trainer.process(1) -} - checkAllAgainstTruth - }*/ -} diff --git a/src/test/scala/cc/factorie/util/TestAssignmentSolver.scala b/src/test/scala/cc/factorie/util/TestAssignmentSolver.scala deleted file mode 100644 index f0285d6..0000000 --- a/src/test/scala/cc/factorie/util/TestAssignmentSolver.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.util -import cc.factorie.la.DenseTensor2 -import org.junit.Assert._ -import org.junit._ - -/** - * User: apassos - * Date: 6/3/13 - * Time: 12:00 PM - */ -class TestAssignmentSolver { - @Test def testSingleSource() { - val weights = new DenseTensor2(1, 10) - weights(0, 5) = 5 - val result = new AssignmentSolver(weights).solve() - assertEquals(1, result.length) - assertEquals((0,5), result.head) - } - - @Test def testSingleTarget() { - val weights = new DenseTensor2(10, 1) - weights(5, 0) = 5 - val result = new AssignmentSolver(weights).solve() - assertEquals(1, result.length) - assertEquals((5,0), result.head) - } - - @Test def testAscendingChain() { - val weights = new DenseTensor2(5, 5) - for (i <- 0 until 5) weights(i, i) = 2 - for (i <- 0 until 4) weights(i,i+1) = 3 - val result = new AssignmentSolver(weights).solve() - assertEquals(4, result.length) - val parents = Array.fill(5)(-1) - for ((s,t) <- result) parents(s) = t - for (i <- 0 until 4) assertEquals(i+1, parents(i)) - } - - - @Test def testAscendingChainLose() { - val weights = new DenseTensor2(5, 5) - for (i <- 0 until 5) weights(i, i) = 4 - for (i <- 0 until 4) weights(i,i+1) = 3 - val result = new AssignmentSolver(weights).solve() - assertEquals(5, result.length) - val parents = Array.fill(5)(-1) - for ((s,t) <- result) parents(s) = t - for (i <- 0 until 5) assertEquals(i, parents(i)) - } - - @Test def testHeaviestMatch() { - val weights = new DenseTensor2(3, 3) - weights(0, 0) = 0 - weights(0, 1) = 3 - weights(0, 2) = 0 - weights(1, 0) = 2 - weights(1, 1) = 10 - weights(1, 2) = 3 - weights(2, 0) = 0 - weights(2, 1) = 5 - weights(2, 2) = 0 - val result = new AssignmentSolver(weights).solve() - val parents = Array.fill(3)(-1) - for ((s,t) <- result) parents(s) = t - assertEquals(1, parents(1)) - } -} diff --git a/src/test/scala/cc/factorie/util/TestCmdOptions.scala b/src/test/scala/cc/factorie/util/TestCmdOptions.scala deleted file mode 100644 index 89aaa9a..0000000 --- a/src/test/scala/cc/factorie/util/TestCmdOptions.scala +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.util - -import java.io.File - -import org.scalatest.{Matchers, FlatSpec} - -/** - * @author John Sullivan - */ -class TestCmdOptions extends FlatSpec with Matchers { - - "CmdOptions" should "parse comma-separated lists properly" in { - val opts = new DefaultCmdOptions { - val dList = new CmdOption("dlist", List.empty[Double], "", "") - val sList = new CmdOption("slist", List.empty[String], "", "") - val iList = new CmdOption("ilist", List.empty[Int], "" ,"") - } - opts parse Array("--dlist=1.0,2,3.5", "--slist=Foo,Bar,Baz", "--ilist=2,4,6,8,10") - - assert(opts.dList.value == List(1.0, 2.0, 3.5)) - assert(opts.sList.value == List("Foo", "Bar", "Baz")) - assert(opts.iList.value == List(2,4,6,8,10)) - } - - it should "parse primitive types properly" in { - val opts = new DefaultCmdOptions { - val int = new CmdOption("i", 2, "", "") - val long = new CmdOption("l", 1000L, "", "") - val float = new CmdOption("f", 1.0f, "", "") - val double = new CmdOption("d", 1.0, "", "") - val string = new CmdOption("s", "s", "", "") - val boolean = new CmdOption("b", false, "", "") - } - opts parse Array("--i=13", "--l=13", "--f=13", "--d=13", "--s=thirteen", "--b=true") - - assert(opts.int.value == 13) - assert(opts.long.value == 13L) - assert(opts.float.value == 13.0f) - assert(opts.double.value == 13.0) - assert(opts.string.value == "thirteen") - assert(opts.boolean.value) - } - - it should "deal with the short bool case" in { - val opts = new DefaultCmdOptions { - val bool = new CmdOption("long-name", false, "", "", false, 'b') - } - opts parse Array("-b") - - assert(opts.bool.value) - } - - it should "carry through default args" in { - val opts = new DefaultCmdOptions { - val str = new CmdOption("with-default", "default-val", "", "") - } - opts parse Array.empty[String] - assert(opts.str.value == "default-val") - } - - it should "parse space-separated args" in { - val opts = new DefaultCmdOptions { - val str = new CmdOption("separate", "", "", "") - val dub = new CmdOption("next", 5.7, "", "") - } - opts parse Array("--separate", "argument", "--next", "2.34") - assert(opts.str.value == "argument") - assert(opts.dub.value == 2.34) - } - - it should "parse space-separated lists" in { - val opts = new DefaultCmdOptions { - val dubList = new CmdOption("params", List.empty[Double], "", "") - val name = new CmdOption("name", "", "", "") - } - opts parse "--params 1.0 5.3 13 2943.32 --name=steve".split(" ") - assert(opts.dubList.value == List(1.0, 5.3, 13.0, 2943.32)) - assert(opts.name.value == "steve") - } - - it should "parse strings into Files properly" in { - val opts = new CmdOptions { - val file = new CmdOption[File]("file", null, "", "") - } - val tmp = File.createTempFile("test-dir", "test1") - opts parse s"--file ${tmp.getAbsolutePath}".split(" ") - assert(opts.file.value.getAbsolutePath == tmp.getAbsolutePath) - } - - it should "parse string lists into file lists properly" in { - val opts = new CmdOptions { - val files = new CmdOption[List[File]]("files", List.empty, "", "") - } - val tmps = (1 to 3).map(i => File.createTempFile("test-dir", "test" + i)) - opts parse s"--files=${tmps.map(_.getAbsolutePath).mkString(",")}".split(" ") - assert(opts.files.value.zip(tmps).forall{case (a, b) => a.getAbsolutePath == b.getAbsolutePath}) - } - - it should "parse string maps" in { - val target = Map("foo" -> "bar", "baz" -> "quux") - val opts1 = new CmdOptions { - val strMap = new CmdOption[Map[String, String]]("string-map", Map.empty, "", "") - } - opts1 parse "--string-map foo:bar 
baz:quux".split(" ") - assert(target == opts1.strMap.value) - val opts2 = new CmdOptions { - val strMap = new CmdOption[Map[String, String]]("string-map", Map.empty, "", "") - } - opts2 parse "--string-map=foo:bar,baz:quux".split(" ") - assert(target == opts2.strMap.value) - } -} diff --git a/src/test/scala/cc/factorie/util/TestDoubleSeq.scala b/src/test/scala/cc/factorie/util/TestDoubleSeq.scala deleted file mode 100644 index a04f019..0000000 --- a/src/test/scala/cc/factorie/util/TestDoubleSeq.scala +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.util - -import cc.factorie.la.{SparseTensor1, Tensor1} -import org.scalatest._ - -class TestDoubleSeq extends FlatSpec with Matchers { - - val nSamples = 1e7.toInt - - implicit val random = scala.util.Random - - "DenseDoubleSeq.sampleIndex" should "always sample a correct index" in { - - val masses = Array[Double](0, 10, 0, 1, 0) - val totalMass = masses.sum - val props = masses.map(_ / totalMass) - - val seq = new ArrayDoubleSeq(masses) - - val samples = (1 to nSamples).foldLeft(Array.fill(masses.size)(0.0)) { case (acc, i) => - acc(seq.sampleIndex(totalMass)) += 1.0 - acc - } map (_ / nSamples) - - (Tensor1(props:_*) - Tensor1(samples:_*)).twoNorm should be <= 1e-2 - - } - - "SparseDoubleSeq.sampleIndex" should "always sample a correct index" in { - - val masses = Array[Double](0, 10, 0, 1, 0) - val totalMass = masses.sum - val props = masses.map(_ / totalMass) - - val seq = new SparseTensor1(masses.size) - masses.zipWithIndex foreach { case (v, i) => - seq += (i, v) - } - - val samples = (1 to nSamples).foldLeft(Array.fill(masses.size)(0.0)) { case (acc, i) => - acc(seq.sampleIndex(totalMass)) += 1.0 - acc - } map (_ / nSamples) - - (Tensor1(props:_*) - Tensor1(samples:_*)).twoNorm should be <= 1e-2 - - } - -} diff --git a/src/test/scala/cc/factorie/util/TestEvaluatableClustering.scala b/src/test/scala/cc/factorie/util/TestEvaluatableClustering.scala deleted file mode 100644 index fc67877..0000000 --- a/src/test/scala/cc/factorie/util/TestEvaluatableClustering.scala +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.util - -import org.junit.Test - -class TestEvaluatableClustering { - @Test def testBasicEvaluatableClustering(): Unit = { - var g = new BasicEvaluatableClustering(for (i <- 1 to 10) yield Tuple2(i.toString, "a")) - var c = new BasicEvaluatableClustering(for (i <- 1 to 10) yield Tuple2(i.toString, "b")) - assert(ClusterF1Evaluation.Pairwise(c, g).f1 == 1.0) - assert(ClusterF1Evaluation.BCubed(c, g).f1 == 1.0) - assert(ClusterF1Evaluation.BCubedNoSingletons(c, g).f1 == 1.0) - assert(ClusterF1Evaluation.MUC(c, g).f1 == 1.0) - assert(ClusterF1Evaluation.CeafE(c, g).f1 == 1.0) - assert(ClusterF1Evaluation.CeafM(c, g).f1 == 1.0) - assert(g.clusterIds.size == c.clusterIds.size) - assert(g == c) - //println("Before Serialization\n"+c.toString) - c = EvaluatableClustering(c.toString) - //println("After Serialization\n"+c.toString) - assert(g == c) - - g = new BasicEvaluatableClustering(for (i <- 1 to 10) yield Tuple2(i.toString, i.toString)) - c = new BasicEvaluatableClustering(for (i <- 1 to 10) yield Tuple2(i.toString, (i+1).toString)) - assert(ClusterF1Evaluation.Pairwise(c, g).f1 == 1.0) - assert(ClusterF1Evaluation.BCubed(c, g).f1 == 1.0) - assert(ClusterF1Evaluation.BCubedNoSingletons(c, g).f1 == 1.0) - assert(g == c) - //println("Before Serialization\n"+c.toString) - c = EvaluatableClustering(c.toString) - //println("After Serialization\n"+c.toString) - assert(g == c) - - g = new BasicEvaluatableClustering(for (i <- 1 to 3) yield Tuple2(i.toString, (i/2).toString)) - c = new BasicEvaluatableClustering(for (i <- 1 to 3) yield Tuple2(i.toString, "a")) - //println("F1="+PairwiseClusterEvaluation(c, g).f1) - assert(ClusterF1Evaluation.Pairwise(c, g).f1 == 0.5) - assert(g != c) - //println("Before Serialization\n"+c.toString) - val g2 = EvaluatableClustering(g.toString) - //println("After Serialization\n"+c.toString) - assert(g == g2) - assert(g2 == g) - } - -} \ No newline at end of file diff --git a/src/test/scala/cc/factorie/util/TestHyperParameterSearcher.scala b/src/test/scala/cc/factorie/util/TestHyperParameterSearcher.scala deleted file mode 100644 index 7d2d340..0000000 --- a/src/test/scala/cc/factorie/util/TestHyperParameterSearcher.scala +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.util -import org.junit.Assert._ -import org.junit.Test - -import scala.concurrent.Future - -/** - * User: apassos - * Date: 6/9/13 - * Time: 7:51 AM - */ -class TestHyperParameterSearcher { - @Test def testSimpleParamSearch() { - class CmdClass extends CmdOptions { - val value = new CmdOption("value", 0.0, "DOUBLE", "A simple value") - } - val cmds = new CmdClass - val uniformHyper = new UniformDoubleSampler(0.0, 1.0) - def test(args: Array[String]) = { - val t = new CmdClass - t.parse(args) - Future.successful(t.value.value) - } - val optimizer = new HyperParameterSearcher(cmds, - Seq(HyperParameter(cmds.value, uniformHyper)), - test, 10, 10, secondsToSleep = 1) - optimizer.optimize() - assertTrue(cmds.value.hasValue) - assertTrue(cmds.value.value > 0.8) - val seqHyper = new SampleFromSeq(Seq(0.0, 0.1, 0.5, 1.0)) - val cmds2 = new CmdClass - val optimizer2 = new HyperParameterSearcher(cmds2, - Seq(HyperParameter(cmds2.value, seqHyper)), - test, 10, 10, secondsToSleep = 1) - optimizer2.optimize() - assertTrue(cmds2.value.hasValue) - assertEquals(cmds2.value.value, 1.0, 0.0001) - } -} diff --git a/src/test/scala/cc/factorie/util/TestIntAndDoubleSeqCubbie.scala b/src/test/scala/cc/factorie/util/TestIntAndDoubleSeqCubbie.scala deleted file mode 100644 index ed03e02..0000000 --- a/src/test/scala/cc/factorie/util/TestIntAndDoubleSeqCubbie.scala +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.util - -import cc.factorie.db.mongo.MongoCubbieConverter -import org.scalatest._ - -/** - * @author John Sullivan - */ -class TestIntAndDoubleSeqCubbie extends FlatSpec with Matchers { - - def doubleSeqFixture = new { - class DoubleSeqCubbie extends Cubbie { - val doubleSeq = DoubleSeqSlot("test") - } - val c = new DoubleSeqCubbie - val a = Array(1.9, 122323.999, -1293.99127361) - val d = DoubleSeq(a) - c.doubleSeq := d - } - - "DoubleSeqSlot" should "work" in { - val f = doubleSeqFixture - import f._ - assert(c.doubleSeq.value.asArray.zip(f.a).forall{case (x, y) => x == y}) - } - - "IntSeqSlot" should "work" in { - val c = new Cubbie { - val intSeq = IntSeqSlot("test") - } - val a = Array(1, 1999, 49923, -237194) - val i = new ArrayIntSeq(a) - c.intSeq := i - assert(c.intSeq.value.asArray.zip(a).forall{case (x, y) => x == y}) - } - - "IntSeq conversion" should "work" in { - class MyCubbie extends Cubbie { val i = IntSeqSlot("test") } - val a = new ArrayIntSeq(Array(1,2,3)) - val c = new MyCubbie - c.i := a - val dbo = MongoCubbieConverter.eagerDBO(c) - val c2 = MongoCubbieConverter.eagerCubbie(dbo, () => new MyCubbie) - assert(c2.i.value.asSeq == c.i.value.asSeq) - } - -} diff --git a/src/test/scala/cc/factorie/util/TestJsonCubbieConverter.scala b/src/test/scala/cc/factorie/util/TestJsonCubbieConverter.scala deleted file mode 100644 index 240e30d..0000000 --- a/src/test/scala/cc/factorie/util/TestJsonCubbieConverter.scala +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.util - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.parse.{ParseTree, ParseTreeLabelDomain} -import cc.factorie.app.nlp.pos.{PennPosDomain, PennPosTag} -import cc.factorie.util.JsonCubbieConverter._ -import org.json4s.JsonAST._ -import org.json4s.jackson.JsonMethods._ -import org.scalatest.{FlatSpec, Matchers} - -/** - * @author John Sullivan - */ -class TestJsonCubbieConverter extends FlatSpec with Matchers { - class BasicTestCubbie extends Cubbie { - val double = new DoubleSlot("double") - val int = new IntSlot("int") - val string = new StringSlot("string") - } - - def fix = new { - val doc1 = new Document("If it's your job to eat a frog, it's best to do it first thing in the morning. 
And If it's your job to eat two frogs, it's best to eat the biggest one first.") - DocumentAnnotatorPipeline(segment.DeterministicNormalizingTokenizer, segment.DeterministicSentenceSegmenter).process(doc1) - for (token <- doc1.tokens) token.attr += new PennPosTag(token, token.positionInSentence % PennPosDomain.size) - for (sentence <- doc1.sentences) sentence.attr += new ParseTree(sentence, Range(0, sentence.length).toArray, Range(0, sentence.length).map(_ % ParseTreeLabelDomain.length).toArray) - doc1.annotators(classOf[PennPosTag]) = this.getClass - doc1.annotators(classOf[ParseTree]) = this.getClass - } - - "JsonCubbieConverter" should "serialize and deserialize primitive values properly" in { - val c = new BasicTestCubbie() - - c.double set 3.2043 - c.int set 5 - c.string set "test string" - - val json = toJson(c) - val expectedJson = JObject(JField("double", JDouble(3.2043)), JField("int", JInt(5)), JField("string", JString("test string"))) - - assert(json == expectedJson, "expected %s but got %s".format(compact(render(expectedJson)), compact(render(json)))) - - val deserialized = toCubbie(json, {() => new BasicTestCubbie}) - assert(deserialized._map == c._map, "expected %s but got %s".format(c._map, deserialized._map)) - } - - it should "serialize and deserialize DocumentCubbies properly" in { - val f = fix - import f._ - - val c = new StandardDocumentCubbie() := doc1 - val json = toJson(c) - val doc2 = toCubbie(json, {() => new StandardDocumentCubbie()}).document - assert(doc1.tokens.toSeq.map(_.string) == doc2.tokens.toSeq.map(_.string)) - assert(doc1.tokens.toSeq.map(_.posTag.categoryValue) == doc2.tokens.toSeq.map(_.posTag.categoryValue)) - } -} diff --git a/src/test/scala/cc/factorie/variable/TestBagOfWords.scala b/src/test/scala/cc/factorie/variable/TestBagOfWords.scala deleted file mode 100644 index 0b1fbdf..0000000 --- a/src/test/scala/cc/factorie/variable/TestBagOfWords.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.variable - -import org.scalatest.{FlatSpec, Matchers} - -/** - * @author John Sullivan - */ -class TestBagOfWords extends FlatSpec with Matchers { - - "BagOfWords" should "work subtract properly with new values" in { - val b1 = new BagOfWords() - b1 -=("foo", 1.0) - assert(b1.asHashMap.keySet == Set("foo")) - assert(b1.asHashMap("foo") == -1.0) - } - - "BagOfWords" should "initialize from initialWords string" in { - val b = new BagOfWords("the quick brown fox jumps over the lazy dog".split(" ")) - assert(b.size == 8) - assert(b.asHashMap.size == 8) - - assert(b.asHashMap("the") == 2.0) - assert(b.asHashMap("fox") == 1.0) - assert(b("the") == 2.0) - assert(b("fox") == 1.0) - - assert(b.contains("the")) - assert(!b.contains("abcd")) - } -} diff --git a/src/test/scala/cc/factorie/variable/TestCategoricalDomain.scala b/src/test/scala/cc/factorie/variable/TestCategoricalDomain.scala deleted file mode 100644 index e11098b..0000000 --- a/src/test/scala/cc/factorie/variable/TestCategoricalDomain.scala +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.variable - -import org.junit.Test -import org.scalatest.junit._ - - -class TestCategoricalDomain extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testAdditionsWithCounting(): Unit = { - val domain = new CategoricalDomain[String](List("yes", "no")) - domain.freeze() - assert(domain.size == 2) // domain should have 2 categories - assert(domain.count("yes") == 0) // nothing should be counted yet - domain.gatherCounts = true - domain ++= List("yes") // ++= should increment the count - assert(domain.count("yes") == 1) - domain += "yes" // += should also increment the count - assert(domain.count("yes") == 2) - } - - @Test - def testCategoricalDomain(): Unit = { - - object FootSizeDomain extends CategoricalDomain[String] { - value("LARGE") - value("SMALL") - freeze() - } - // test domain properties - assert(FootSizeDomain.dimensionSize == 2) - assert(FootSizeDomain.dimensionDomain == FootSizeDomain) - - // Add new category - // here should raise an exception, but current implementation just ignore it - FootSizeDomain += "MEDIUM" - assert(FootSizeDomain.value("MEDIUM") == null) - FootSizeDomain.unfreeze() - FootSizeDomain += "MEDIUM" - assert(FootSizeDomain.value("MEDIUM") != null) - assert(FootSizeDomain.dimensionSize == 3) - FootSizeDomain.freeze() - - // read category values, should be an instance of CategoricalValue[String] - assert(FootSizeDomain.head.isInstanceOf[CategoricalValue[String]]) - assert(FootSizeDomain.head.category == "LARGE") - - class FootSize(category:String) extends CategoricalVariable(category) { - override def domain = FootSizeDomain - } - // test domain variable - var v = new FootSize("LARGE") - assert(v.value == FootSizeDomain.value("LARGE")) - assert(v.domain == FootSizeDomain) - - } - -} diff --git a/src/test/scala/cc/factorie/variable/TestCategoricalVectorVariable.scala b/src/test/scala/cc/factorie/variable/TestCategoricalVectorVariable.scala deleted file mode 100644 index e8f04b4..0000000 --- a/src/test/scala/cc/factorie/variable/TestCategoricalVectorVariable.scala +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.variable - -import cc.factorie.la.GrowableSparseBinaryTensor1 -import org.junit.Test -import org.scalatest.junit._ - - -class TestCategoricalVectorVariable extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testCategoricalVectorVariable(): Unit = { - object DocumentDomain extends CategoricalVectorDomain[String] - class Document extends CategoricalVectorVariable[String] { - - // the value is not set in CategoricalVectorVariable - set(new GrowableSparseBinaryTensor1(domain.dimensionDomain))(null) - - override def domain: CategoricalVectorDomain[String] = DocumentDomain - } - - val document = new Document - document += "hello" - document += "world" - document ++= Seq("a", "b", "c") - - println(document.activeCategories.contains("hello")) - } - -} diff --git a/src/test/scala/cc/factorie/variable/TestDiff.scala b/src/test/scala/cc/factorie/variable/TestDiff.scala deleted file mode 100644 index d0b302d..0000000 --- a/src/test/scala/cc/factorie/variable/TestDiff.scala +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.variable - -import org.junit.Test -import org.scalatest.junit._ - - -class TestDiff extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testCreateDiff() { - //this test just shows how variables create diff objects that point to them - val b = new BooleanVariable(true) - val diff = new DiffList - b.set(false)(diff) - assert(diff(0).variable === b) - } - - @Test - def testDiffUndoAndRedo() { - val i = new IntegerVariable(2) - - val d = new DiffList - i.set(3)(d) // This method will create a Diff object and append it to the DiffList d. - assert(3 == i.value) - - d.undo() - assert(2 == i.value) - - d.redo() - assert(3 == i.value) - } -} diff --git a/src/test/scala/cc/factorie/variable/TestDiscreteSeqVariable.scala b/src/test/scala/cc/factorie/variable/TestDiscreteSeqVariable.scala deleted file mode 100644 index 57ed870..0000000 --- a/src/test/scala/cc/factorie/variable/TestDiscreteSeqVariable.scala +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.variable - -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit._ - - -class TestDiscreteSeqVariable extends JUnitSuite with cc.factorie.util.FastLogging { - - object DiceDomain extends DiscreteDomain(6) - object DiceSeqDomain extends DiscreteSeqDomain { def elementDomain = DiceDomain } - // number of dice we want to pick - class DiceSeq(num:Int) extends DiscreteSeqVariable(num) { def domain = DiceSeqDomain } - - @Test - def testDiscreteSeqDomain(): Unit = { - } - - @Test - def testDiscreteSeqVariable(): Unit = { - // lets create a three dice sequence - val ds1 = new DiceSeq(3) - // each dice should be initialized to 0 - assertArrayEquals(Array(0,0,0), ds1.toSeq.map(_.intValue).toArray) - - // set value for an element - ds1.set(0, 1)(null) - assertArrayEquals(Array(1,0,0), ds1.toSeq.map(_.intValue).toArray) - } - -} diff --git a/src/test/scala/cc/factorie/variable/TestDiscreteVariable.scala b/src/test/scala/cc/factorie/variable/TestDiscreteVariable.scala deleted file mode 100644 index 799ed1e..0000000 --- a/src/test/scala/cc/factorie/variable/TestDiscreteVariable.scala +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.variable - -import org.junit.Test -import org.scalatest.junit._ - - -class TestDiscreteVariable extends JUnitSuite with cc.factorie.util.FastLogging { - - // dice has six possible values, from 0 to 5 - object DiceDomain extends DiscreteDomain(6) - - class Dice(v: Int) extends DiscreteVariable(v) { - override def domain: DiscreteDomain = DiceDomain - } - - @Test - def testDiscreteVariable(): Unit = { - val v = new Dice(0) - - assert(v.domain == DiceDomain) - assert(v.value == DiceDomain(0)) - assert(v.intValue == 0) - - v.set(2)(null) - assert(v.value == DiceDomain(2)) - - v.set(DiceDomain(4))(null) - assert(v.value == DiceDomain(4)) - } - - - @Test - def testDiscreteVariableDiff(): Unit = { - val d = new DiffList - - val v = new Dice(0) - assert(v.value == DiceDomain(0)) - assert(v.intValue == 0) - - v.set(2)(d) - assert(v.value == DiceDomain(2)) - d.undo() - assert(v.value == DiceDomain(0)) - d.redo() - assert(v.value == DiceDomain(2)) - } - -} diff --git a/src/test/scala/cc/factorie/variable/TestEdgeVariable.scala b/src/test/scala/cc/factorie/variable/TestEdgeVariable.scala deleted file mode 100644 index e47ae23..0000000 --- a/src/test/scala/cc/factorie/variable/TestEdgeVariable.scala +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.variable - -import org.junit.Test -import org.scalatest.junit._ - - -class TestEdgeVariable extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testEdgeVariable() { - - val i = new IntegerVariable(0) - val j = new IntegerVariable(0) - val k = new IntegerVariable(0) - - // Variable whose value is a Tuple2 of Scala pointers - val e = new EdgeVariable(i, j) - assert(e.src === i) - assert(e.dst === j) - - e.setSrc(k)(null) - assert(e.src === k) - - e.setDst(i)(null) - assert(e.dst === i) - } - - @Test - def testArrowVariable() { - val i = new IntegerVariable(0) - val j = new IntegerVariable(0) - val k = new IntegerVariable(0) - - // Variable whose value is a Tuple2 of Scala pointers, - // but you can only change the second of the pair - val a = new ArrowVariable(i, j) - - assert(a.src === i) - assert(a.dst === j) - - a.set(k)(null) - assert(a.dst === k) - } -} diff --git a/src/test/scala/cc/factorie/variable/TestEnumDomain.scala b/src/test/scala/cc/factorie/variable/TestEnumDomain.scala deleted file mode 100644 index f90320b..0000000 --- a/src/test/scala/cc/factorie/variable/TestEnumDomain.scala +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.variable - -import org.junit.Test -import org.scalatest.junit._ - - -class TestEnumDomain extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testEnumDomain(): Unit = { - - object DiceDomain extends EnumDomain { val ONE, TWO, THREE, FOUR, FIVE, SIX = Value } - class DiceSample extends DiscreteVariable { def domain = DiceDomain } - - // size - assert(DiceDomain.size == 6) - - assert(DiceDomain.ONE == 0) - assert(DiceDomain.TWO == 1) - assert(DiceDomain.THREE == 2) - assert(DiceDomain.FOUR == 3) - assert(DiceDomain.FIVE == 4) - assert(DiceDomain.SIX == 5) - - assert(DiceDomain.category(0) == "ONE") - assert(DiceDomain.category(1) == "TWO") - assert(DiceDomain.category(2) == "THREE") - assert(DiceDomain.category(3) == "FOUR") - assert(DiceDomain.category(4) == "FIVE") - assert(DiceDomain.category(5) == "SIX") - } - -} diff --git a/src/test/scala/cc/factorie/variable/TestFeatureVectorVariable.scala b/src/test/scala/cc/factorie/variable/TestFeatureVectorVariable.scala deleted file mode 100644 index 356367b..0000000 --- a/src/test/scala/cc/factorie/variable/TestFeatureVectorVariable.scala +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.variable - -import org.junit.Test -import org.scalatest.junit._ - - -class TestFeatureVectorVariable extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testBinaryFeatureVectorVariable(): Unit = { - - object DocumentDomain extends CategoricalVectorDomain[String] - assert(DocumentDomain.dimensionSize == 0) - - class Document extends BinaryFeatureVectorVariable[String] { - override def domain: CategoricalVectorDomain[String] = DocumentDomain - } - - val document = new Document - document += "hello" - document += "world" - document ++= Seq("a", "b", "c") - assert(DocumentDomain.dimensionSize == 5) - - println(DocumentDomain.stringToCategory("hello")) - println(DocumentDomain.stringToCategory("xyz")) - assert(DocumentDomain.dimensionSize == 5) - - assert(document.activeCategories.contains("hello")) - } - - @Test - def testFeatureVectorVariable(): Unit = { - object featureDomain extends CategoricalVectorDomain[String] - val v = new FeatureVectorVariable[String]() { - override def domain: CategoricalVectorDomain[String] = featureDomain - } - - v += "hello" - v += "world" - - println(v) - } -} diff --git a/src/test/scala/cc/factorie/variable/TestIntegerVariable.scala b/src/test/scala/cc/factorie/variable/TestIntegerVariable.scala deleted file mode 100644 index 09e5277..0000000 --- a/src/test/scala/cc/factorie/variable/TestIntegerVariable.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. 
- This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -package cc.factorie.variable - -import junit.framework._ - -/** @author Pallika Kanani */ -class TestIntegerVariable extends TestCase with cc.factorie.util.FastLogging { - - def testDiffLists(): Unit = { - val initialValue:Int = 100 - val v = new IntegerVariable(initialValue) - val d: DiffList = new DiffList() - logger.debug("Initial Value = " + v.intValue) - v.set(200)(d) - v.set(300)(d) - v.set(400)(d) - v.set(500)(d) - v.set(600)(d) - d.reverse.foreach( a => a.undo()) - assert(v.intValue == initialValue) - } -} - - -object TestIntegerVariable extends TestSuite { - addTestSuite(classOf[TestIntegerVariable]) - def main(args: Array[String]) { - junit.textui.TestRunner.run(this) - } -} diff --git a/src/test/scala/cc/factorie/variable/TestLabeledVariable.scala b/src/test/scala/cc/factorie/variable/TestLabeledVariable.scala deleted file mode 100644 index d9b5e8b..0000000 --- a/src/test/scala/cc/factorie/variable/TestLabeledVariable.scala +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.variable - -import org.junit.Test -import org.scalatest.junit._ - - -class TestLabeledVariable extends JUnitSuite with cc.factorie.util.FastLogging { - - - // LabeledCategoricalVariable - @Test - def testLabeledCategoricalVariable(): Unit = { - object GenderDomain extends CategoricalDomain[String] { - value("male") - value("female") - freeze() - } - val v = new LabeledCategoricalVariable[String]("male") { - override def domain: CategoricalDomain[String] = GenderDomain - } - - assert(v.target.value == GenderDomain.value("male")) - - v.set(GenderDomain.value("female"))(null) - assert(v.value == GenderDomain.value("female")) - assert(!v.valueIsTarget) - - v.set(GenderDomain.value("male"))(null) - assert(v.value == GenderDomain.value("male")) - assert(v.valueIsTarget) - - } - - // LabeledIntegerVariable - @Test - def testLabeledIntegerVariable(): Unit = { - val v = new LabeledIntegerVariable(2) - v.set(0)(null) - - assert(v.intValue == 0) - assert(v.target.intValue == 2) - assert(!v.valueIsTarget) - - v.set(2)(null) - assert(v.intValue == 2) - assert(v.valueIsTarget) - } - - // LabeledBooleanVariable - @Test - def testLabeledBooleanVariable(): Unit = { - val v = new LabeledBooleanVariable(true) {} - assert(v.target.booleanValue) - - v.set(false)(null) - assert(!v.booleanValue) - assert(!v.valueIsTarget) - - v.set(true)(null) - assert(v.booleanValue) - assert(v.valueIsTarget) - } - -} diff --git a/src/test/scala/cc/factorie/variable/TestMassesVariable.scala b/src/test/scala/cc/factorie/variable/TestMassesVariable.scala deleted file mode 100644 index c1272f8..0000000 --- a/src/test/scala/cc/factorie/variable/TestMassesVariable.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.variable - -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit._ - - -class TestMassesVariable extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testDenseMasses1(): Unit = { - var m = new DenseMasses1(4, 1.0) - assert(m.dim1 == 4) - assertEquals(1, m(0), 0.001) - assertEquals(0.25, m.pr(1), 0.001) - - m += (0, 1.0) - assertEquals(5, m.massTotal, 0.001) - assertEquals(2, m(0), 0.001) - assertEquals(0.4, m.pr(0), 0.001) - } - -} diff --git a/src/test/scala/cc/factorie/variable/TestProportionsVariable.scala b/src/test/scala/cc/factorie/variable/TestProportionsVariable.scala deleted file mode 100644 index 4999748..0000000 --- a/src/test/scala/cc/factorie/variable/TestProportionsVariable.scala +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. 
- This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -package cc.factorie.variable - -import cc.factorie.util.DoubleSeq -import org.junit.Assert._ -import org.junit.Test -import org.scalatest.junit._ - - -class TestProportionsVariable extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testDenseProportions1(): Unit = { - val m1 = new DenseProportions1(4, 1) - assertEquals(0.25, m1.pr(0), 0.001) - - val m2 = new DenseProportions1(DoubleSeq(1.0, 1,1,1)) - assertEquals(0.25, m2.pr(0), 0.001) - - val m3 = new DenseProportions1(Array(1.0, 1,1,1)) - assertEquals(0.25, m3.pr(0), 0.001) - } - -} - -class TestGrowableDenseProportions1 extends JUnitSuite with cc.factorie.util.FastLogging { - - @Test - def testGrowableDenseProportions1(): Unit = { - object GrowableDomain extends CategoricalDomain[String] - val p = new GrowableDenseProportions1(GrowableDomain) - assert(p.size == 0) - - GrowableDomain.value("hello") - assert(p.size == 1) - assertEquals(1.0, p(0), 0.001) - - GrowableDomain.value("world") - assert(p.size == 2) - assertEquals(0.5, p(0), 0.001) - } -} - -class TestGrowableUniformProportions1 extends JUnitSuite { - - @Test - def testGrowableUniformProportions1(): Unit = { - object GrowableDomain extends CategoricalDomain[String] - val p = new GrowableUniformProportions1(GrowableDomain) - assert(p.size == 0) - - GrowableDomain.value("hello") - assert(p.size == 1) - assertEquals(1.0, p(0), 0.001) - - GrowableDomain.value("world") - assert(p.size == 2) - assertEquals(0.5, p(0), 0.001) - } -} \ No newline at end of file diff --git a/src/test/scala/cc/factorie/variable/TestSpanVariable.scala b/src/test/scala/cc/factorie/variable/TestSpanVariable.scala deleted file mode 100644 index 5382159..0000000 --- a/src/test/scala/cc/factorie/variable/TestSpanVariable.scala +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -package cc.factorie.variable - -import cc.factorie.app.nlp._ -import cc.factorie.app.nlp.ner._ -import cc.factorie.app.nlp.segment.DeterministicNormalizingTokenizer -import junit.framework.Assert._ -import junit.framework._ - -class TestSpanVariable extends TestCase with cc.factorie.util.FastLogging { - - class MySpanBuffer extends SpanVarBuffer[TokenSpan,Section,Token] - - def testDiffLists(): Unit = { - val doc = load.LoadPlainText.fromString("aaa bb John Smith eee ff ggg").head - val sl = new MySpanBuffer - doc.attr += sl - - DeterministicNormalizingTokenizer.process(doc) - - //doc.foreach(logger.debug(_)) - assertEquals(7, doc.tokenCount) - val d = new DiffList - val s1 = new TokenSpan(doc.asSection, 1, 1) - doc.attr[MySpanBuffer].add(s1)(d) - assert(sl.head.start == 1) - //logger.debug("DiffList "+d) - //logger.debug("new span 1 1") - //logger.debug(doc.spans.mkString("\n")) - //logger.debug("DiffList "+d) - d.undo() - //logger.debug("undo") - //logger.debug("DiffList "+d) - //logger.debug(doc.spans.mkString("\n")) - assert(sl.length == 0) - val s2 = new ConllNerSpan(doc.asSection, 2, 2, "PER") - sl += s2 - assert(s2.string == "John Smith") - val s3 = new TokenSpan(doc.asSection, 4, 1) - sl += s3 - assert(sl.spansOfClass[NerSpan].length == 1) - val d2 = new DiffList - sl.remove(s3)(d2) - assert(sl.length == 1) - d2.undo() - assert(sl.length == 2) - sl.clear() - assert(sl.length == 0) - } - -} - - -object TestSpanVariable extends TestSuite { - addTestSuite(classOf[TestSpanVariable]) - def main(args: Array[String]) { - junit.textui.TestRunner.run(this) - } -} diff --git a/src/test/scala/cc/factorie/variable/TestVectorVariable.scala b/src/test/scala/cc/factorie/variable/TestVectorVariable.scala deleted file mode 100644 index 29d02a4..0000000 --- a/src/test/scala/cc/factorie/variable/TestVectorVariable.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (C) 2008-2016 University of Massachusetts Amherst. - This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible) - http://factorie.cs.umass.edu, http://github.com/factorie - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/
-package cc.factorie.variable
-
-import cc.factorie.la.GrowableSparseBinaryTensor1
-import org.junit.Test
-import org.scalatest.junit._
-
-
-class TestVectorVariable extends JUnitSuite with cc.factorie.util.FastLogging {
-
-  val vectorDimensionDomain = new DiscreteDomain(3)
-  // a VectorDomain
-  val vectorDomain = new VectorDomain {
-    override type Value = BooleanValue
-    override def dimensionDomain: DiscreteDomain = vectorDimensionDomain
-  }
-
-  @Test
-  def testVectorDomain(): Unit = {
-
-    assert(vectorDomain.dimensionName(1) == "1")
-    assert(vectorDomain.dimensionSize == 3)
-
-    // VectorDomain provides a proxy to freeze the underlying dimensionDomain
-    assert(!vectorDomain.dimensionDomain.frozen)
-    vectorDomain.freeze()
-    assert(vectorDomain.dimensionDomain.frozen)
-  }
-
-  @Test
-  def testVectorVariable(): Unit = {
-    val v = new VectorVariable {
-      override def domain: VectorDomain = vectorDomain
-
-      // VectorVariable does not specify how to save the value
-      set(new GrowableSparseBinaryTensor1(domain.dimensionDomain))(null)
-    }
-
-    assert(!v.contains(0))
-    v.update(0, 1.0)(null)
-    assert(v.contains(0))
-  }
-
-}

From da84fc518198964c6f62cbf01f2a1cf24b9979b1 Mon Sep 17 00:00:00 2001
From: andrewresearch
Date: Sun, 29 Oct 2017 19:17:41 +1000
Subject: [PATCH 2/2] factorie-nlp-api-1 Basic config to support high level api

---
 .../factorie/app/nlp/pos/SpanishPosTag.scala | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 src/main/scala/cc/factorie/app/nlp/pos/SpanishPosTag.scala

diff --git a/src/main/scala/cc/factorie/app/nlp/pos/SpanishPosTag.scala b/src/main/scala/cc/factorie/app/nlp/pos/SpanishPosTag.scala
new file mode 100644
index 0000000..f8611b7
--- /dev/null
+++ b/src/main/scala/cc/factorie/app/nlp/pos/SpanishPosTag.scala
@@ -0,0 +1,66 @@
+/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
+   This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
+   http://factorie.cs.umass.edu, http://github.com/factorie
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+package cc.factorie.app.nlp.pos
+import cc.factorie.app.nlp._
+import cc.factorie.variable._
+
+
+
+
+
+/** A categorical variable, associated with a token, holding its Google Universal part-of-speech category,
+    which also separately holds its desired correct "target" value. */
+class LabeledUniversalPosTag(token:Token, targetValue:String) extends UniversalPosTag(token, targetValue) with CategoricalLabeling[String]
+
+
+/** Spanish part-of-speech tag domain. */
+object SpanishPosDomain extends CategoricalDomain[String] {
+  this ++= Vector(
+    "a", // adjective
+    "c", // conjunction
+    "d", // determiner
+    "f", // punctuation
+    "i", // interjection
+    "n", // noun
+    "p", // pronoun
+    "r", // adverb
+    "s", // preposition
+    "v", // verb
+    "w", // date
+    "z", // number
+    "_"  // unknown
+  )
+  freeze()
+
+  def isNoun(pos:String): Boolean = pos(0) == 'n'
+//  def isProperNoun(pos:String) = { pos == "NNP" || pos == "NNPS" }
+  def isVerb(pos:String) = pos(0) == 'v'
+  def isAdjective(pos:String) = pos(0) == 'a'
+//  def isPersonalPronoun(pos: String) = pos == "PRP"
+}
+/** A categorical variable, associated with a token, holding its Spanish part-of-speech category. */
+class SpanishPosTag(token:Token, initialIndex:Int) extends PosTag(token, initialIndex) {
+  def this(token:Token, initialCategory:String) = this(token, SpanishPosDomain.index(initialCategory))
+  final def domain = SpanishPosDomain
+  def isNoun = SpanishPosDomain.isNoun(categoryValue)
+//  def isProperNoun = SpanishPosDomain.isProperNoun(categoryValue)
+  def isVerb = SpanishPosDomain.isVerb(categoryValue)
+  def isAdjective = SpanishPosDomain.isAdjective(categoryValue)
+//  def isPersonalPronoun = SpanishPosDomain.isPersonalPronoun(categoryValue)
+}
+
+/** A categorical variable, associated with a token, holding its Spanish Treebank part-of-speech category,
+    which also separately holds its desired correct "target" value. */
+class LabeledSpanishPosTag(token:Token, targetValue:String) extends SpanishPosTag(token, targetValue) with CategoricalLabeling[String]
+
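
Reviewer note (not part of the patch): a minimal, untested sketch of how the SpanishPosTag added in PATCH 2/2 might be exercised. It assumes the Document and Token constructors from the existing cc.factorie.app.nlp package (document text plus character offsets) and the standard attr map on Token; the object name SpanishPosTagSketch and the sample sentence are illustrative only.

import cc.factorie.app.nlp.{Document, Token}
import cc.factorie.app.nlp.pos.{SpanishPosDomain, SpanishPosTag}

object SpanishPosTagSketch {
  def main(args: Array[String]): Unit = {
    // Build a tiny document and hand-construct a token for the first word ("El", character offsets 0-2).
    val doc = new Document("El gato duerme")
    val el = new Token(doc, 0, 2)

    // Attach a Spanish POS tag by category string; "d" is the determiner category in SpanishPosDomain.
    el.attr += new SpanishPosTag(el, "d")

    println(el.attr[SpanishPosTag].categoryValue) // prints "d"
    println(el.attr[SpanishPosTag].isNoun)        // prints false
    println(SpanishPosDomain.isVerb("v"))         // prints true
  }
}

Because SpanishPosDomain is frozen after its thirteen single-letter categories are added, constructing a SpanishPosTag with a category outside that set will fail at index lookup, which is the intended guard against typos in tag strings.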