textrank.go

package textrank

import (
	"github.com/DavidBelicza/TextRank/v2/convert"
	"github.com/DavidBelicza/TextRank/v2/parse"
	"github.com/DavidBelicza/TextRank/v2/rank"
)

// TextRank structure contains the Rank data object. This structure is a wrapper
// around the whole text ranking functionality.
type TextRank struct {
	rank *rank.Rank
}

// NewTextRank constructor retrieves a TextRank pointer. This is the 1th step to
// use TextRank.
func NewTextRank() *TextRank {
	return &TextRank{
		rank.NewRank(),
	}
}

// NewDefaultRule function retrieves a default Rule object what works in the
// most cases in English or similar Latin languages like French or Spanish. The
// Rule defines raw text how should be split to sentences and words. Because
// Rule is an interface it's possible modify the ranking by inject different
// Rule implementation. This is the 2nd step to use TextRank.
func NewDefaultRule() *parse.RuleDefault {
	return parse.NewRule()
}

// NewDefaultLanguage function retrieves a default Language object. It defines
// what words are real and what words are just Stop Words or useless Junk Words.
// It uses the default English Stop Words, but it's possible to set different
// Stop Words in English or any other languages. Because Language is an
// interface it's possible to modify the ranking by inject different Language
// implementation. This is the 3rd step to use TextRank.
func NewDefaultLanguage() *convert.LanguageDefault {
	return convert.NewLanguage()
}

// NewDefaultAlgorithm function retrieves an Algorithm object. It defines how
// should work the text ranking algorithm, the weighting. This is the general
// text rank by weighting the connection between the words to find the strongest
// phrases. Because Algorithm is an interface it's possible to modify the
// ranking algorithm by inject different implementation. This is the 4th step to
// use TextRank.
func NewDefaultAlgorithm() *rank.AlgorithmDefault {
	return rank.NewAlgorithmDefault()
}

// NewChainAlgorithm function retrieves an Algorithm object. It defines how
// should work the text ranking algorithm, the weighting. This is an alternative
// way to ranking words by weighting the number of the words. Because Algorithm
// is an interface it's possible to modify the ranking algorithm by inject
// different implementation. This is  the 4th step to use TextRank.
func NewChainAlgorithm() *rank.AlgorithmChain {
	return rank.NewAlgorithmChain()
}

// Populate method adds a raw text to the text-ranking graph. It parses,
// tokenize the raw text and prepares it to weighting and scoring. It's possible
// to append a new raw text to an existing one even if the previously text is
// already ranked. This is 5th step to use TextRank.
//
// text string must be a plain text from TXT or PDF or any document, it can
// contain new lines, break lines or any unnecessary text parts, but it should
// not contain HTML tags or codes.
//
// lang Language object can be loaded from NewDefaultLanguage function.
//
// rule Rule object can be loaded from NewDefaultRule function.
func (textRank *TextRank) Populate(
	text string,
	lang convert.Language,
	rule parse.Rule,
) {
	parsedText := parse.TokenizeText(text, rule)

	for _, sentence := range parsedText.GetSentences() {
		convert.TextToRank(sentence, lang, textRank.rank)
	}
}

// Ranking method counts the words and connections between the words, then it
// weights the numbers then normalize them in type float32 between 0.00 and
// 1.00. This is the 6th step to use TextRank.
//
// algorithm Algorithm is the object of the weighting and scoring methods.
func (textRank *TextRank) Ranking(algorithm rank.Algorithm) {
	rank.Calculate(textRank.rank, algorithm)
}

// GetRankData method retrieves the Rank data to that case if the developer want
// access to the whole graph and sentences, words, weights and all of the data
// to analyze it or just implement a new search logic or finder method.
func (textRank *TextRank) GetRankData() *rank.Rank {
	return textRank.rank
}

// FindPhrases function retrieves a slice of Phrase structures by TextRank
// object. The return value contains the sorted phrases with IDs, words, weights
// and quantities by weight from 1 to 0. Weight is calculated from quantities of
// relation between two words. A single phrase is from two words - not less and
// more. (But it's possible to find chain of phrases by
// FindSentencesByPhraseChain function.)
func FindPhrases(textRank *TextRank) []rank.Phrase {
	return rank.FindPhrases(textRank.rank)
}

// FindSingleWords function retrieves a slice of SingleWord structures by
// TextRank object. The return value contains the sorted words with IDs, words,
// weights and quantities by weight from 1 to 0. Weight is calculated from
// quantities of word.
func FindSingleWords(textRank *TextRank) []rank.SingleWord {
	return rank.FindSingleWords(textRank.rank)
}

// FindSentencesByRelationWeight function retrieves a slice of Sentence
// structures by TextRank object. The return value contains the ID of the
// sentence and the sentence text itself. The slice is sorted by weight of
// phrases from 1 to 0.
func FindSentencesByRelationWeight(
	textRank *TextRank,
	limit int,
) []rank.Sentence {

	return rank.FindSentences(textRank.rank, rank.ByRelation, limit)
}

// FindSentencesByWordQtyWeight function retrieves a slice of Sentence
// structures by TextRank object. The return value contains the ID of the
// sentence and the sentence text itself. The slice is sorted by weight of word
// quantities from 1 to 0.
func FindSentencesByWordQtyWeight(
	textRank *TextRank,
	limit int,
) []rank.Sentence {

	return rank.FindSentences(textRank.rank, rank.ByQty, limit)
}

// FindSentencesByPhraseChain function retrieves a slice of Sentence structures
// by TextRank object and slice of phrases. The return value contains the ID of
// the sentence and the sentence text itself. The slice is sorted by weight of
// word quantities from 1 to 0.
//
// textRank TextRank is the object of the TextRank.
//
// phrases []string is a slice of phrases. A single phrase is from two words, so
// when the slice contains 3 words the inner method will search for two phrases.
// The search algorithm seeks for "len(phrases)!". In case of three item the
// possible combination is 3 factorial (3!) = 3 * 2 * 1.
//
//    rawText := "Long raw text, lorem ipsum..."
//    rule := NewDefaultRule()
//    language := NewDefaultLanguage()
//    algorithm := NewDefaultAlgorithm()
//
//    Append(rawText, language, rule, 1)
//    Ranking(1, algorithm)
//
//    FindSentencesByPhraseChain(1, []string{
//        "captain",
//        "james",
//        "kirk",
//    })
//
// The above code searches for captain james kirk, captain kirk james, james
// kirk captain, james captain kirk, kirk james captain and james kirk captain
// combinations in the graph. The 3 of words have to be related to each other
// in the same sentence but the search algorithm ignores the stop words. So if
// there is a sentence "James Kirk is the Captain of the Enterprise." the
// sentence will be returned because the words "is" and "the" are stop words.
func FindSentencesByPhraseChain(
	textRank *TextRank,
	phrases []string,
) []rank.Sentence {

	return rank.FindSentencesByPhrases(textRank.rank, phrases)
}

// FindSentencesFrom function retrieves a slice of Sentence structures by
// TextRank object and by ID of the sentence. The return value contains the
// sentence text itself. The returned slice contains sentences sorted by their
// IDs started from the given sentence ID in ascending sort.
func FindSentencesFrom(
	textRank *TextRank,
	sentenceID int,
	limit int,
) []rank.Sentence {

	return rank.FindSentencesFrom(textRank.rank, sentenceID, limit)
}