Pattern(pattern)
+
Class for representing a pattern (a regular expression).
-
Initialize a Pattern.
+
+
Parameters:
-
- pattern
+
pattern
(str | Pattern
)
–
@@ -153,70 +154,46 @@
+
-
- filter
+
+ filter
-filter(strings, labels = None)
+
-
Return strings and labels which match the pattern and which don't.
+
Return a boolean mask for strings matching the pattern.
+
+
Parameters:
-
- strings
+
strings
(list[str]
)
–
- -
- labels
- (
list[int]
)
- –
-
-
+
+
Returns:
-
-
matched_strings
- –
-
-
Strings which match the pattern.
-
-
- -
-
labels_of_matched
+ matches
–
-
Labels of strings which match the pattern. If no labels provided, an empty list.
-
-
- -
-
not_matched_strings
- –
-
-
Strings which don't match the pattern.
-
-
- -
-
labels_of_not_matched
- –
-
-
Labels of strings which don't match the pattern. If no labels provided, an empty list.
+
Numpy array of bool type.
@@ -225,16 +202,17 @@
+
-
- match
+
+ match
-match(string)
+
@@ -245,16 +223,17 @@
+
-
- scores
+
+ scores
-scores(strings, labels)
+scores(strings, labels, full_labels_list=None)
@@ -262,10 +241,12 @@
Calculate classification quality scores for pattern's matches and the labels.
Pattern's matches are considered as "predictions" when calculating quality metrics.
+
+
Parameters:
-
- strings
+
strings
(list[str]
)
–
@@ -273,22 +254,35 @@
-
- labels
+
labels
(list[int]
)
–
-
A list of strings labels consisting of 0 and 1.
+
A list of labels for the strings.
+
+
+ -
+
full_labels_list
+ (list[int]
, default:
+ None
+)
+ –
+
+
The full list of unique labels that may occur among labels.
+
+
Returns:
-
-response(
dict
+response
( dict
) –
-
Contains keys: 'n_strings', 'total_positive', 'n_matches', 'precision', 'recall' and 'accuracy'.
+
Contains keys: 'n_strings', 'total_positive', 'n_matches', 'precision', 'recall' and 'accuracy'.
+If there are more than 2 classes, the value under each key is a list with one entry per class.
@@ -303,6 +297,7 @@
+
diff --git a/site/patternnode_reference/index.html b/site/patternnode_reference/index.html
index 117b3ec..8bfdafd 100644
--- a/site/patternnode_reference/index.html
+++ b/site/patternnode_reference/index.html
@@ -89,43 +89,43 @@
- PatternNode
- - get_labels()
+
- get_labels
- - get_matches()
+
- get_matches
- - get_pattern()
+
- get_pattern
- - get_scores()
+
- get_scores
- - get_strings()
+
- get_strings
- - set_labels()
+
- set_labels
- - set_matches()
+
- set_matches
- - set_pattern()
+
- set_pattern
- - set_scores()
+
- set_scores
- - set_strings()
+
- set_strings
@@ -140,12 +140,12 @@
-
- PatternNode
+
+ PatternNode
-PatternNode(pattern: Pattern)
+PatternNode(pattern: Pattern)
@@ -153,10 +153,12 @@
Class representing a node in a StringTree.
+
+
Attributes:
-
- right
+
right
(PatternNode
)
–
@@ -164,7 +166,7 @@
-
- left
+
left
(PatternNode
)
–
@@ -172,7 +174,7 @@
-
- pattern
+
pattern
(Pattern
)
–
@@ -180,42 +182,43 @@
-
- matches
+
matches
–
Strings which match a Pattern object attributed to the PatternNode.
-
- strings
+
strings
–
All strings attributed to the PatternNode.
-
- labels
+
labels
–
Labels of all strings attributed to the PatternNode.
-
- scores
+
scores
–
Scores of an attributed pattern applied to the attributed strings.
-
Initialize a PatternNode.
+
+
Parameters:
-
- pattern
+
pattern
(Pattern
)
–
@@ -237,16 +240,17 @@
+
-
- get_labels
+
+ get_labels
-get_labels()
+
@@ -257,16 +261,17 @@
+
-
- get_matches
+
+ get_matches
-get_matches()
+
@@ -277,16 +282,17 @@
+
-
- get_pattern
+
+ get_pattern
-get_pattern()
+
@@ -297,16 +303,17 @@
+
-
- get_scores
+
+ get_scores
-get_scores()
+
@@ -317,16 +324,17 @@
+
-
- get_strings
+
+ get_strings
-get_strings()
+
@@ -337,16 +345,17 @@
+
-
- set_labels
+
+ set_labels
-set_labels(labels)
+
@@ -357,16 +366,17 @@
+
-
- set_matches
+
+ set_matches
-set_matches(matches)
+
@@ -377,16 +387,17 @@
+
-
- set_pattern
+
+ set_pattern
-set_pattern(pattern)
+
@@ -397,16 +408,17 @@
+
-
- set_scores
+
+ set_scores
-set_scores(scores)
+
@@ -417,16 +429,17 @@
+
-
- set_strings
+
+ set_strings
-set_strings(strings)
+
@@ -443,6 +456,7 @@
+
diff --git a/site/search/search_index.json b/site/search/search_index.json
index 61874bf..72f1d4b 100644
--- a/site/search/search_index.json
+++ b/site/search/search_index.json
@@ -1 +1 @@
-{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Basics to strtree strtree is a Python package for strings binary classification, based on regular expressions put in a decision tree. Github repo: stretree With strtree you can: Do a binary classification of your strings using automatically extracted regular expressions Find shortest regular expressions which covers strings with positive labels in the most accurate way Look at a quick example. Example Firstly, let's build a tree from strings and their labels. import strtree strings = ['Samsung X-500', 'Samsung SM-10', 'Samsung X-1100', 'Samsung F-10', 'Samsung X-2200', 'AB Nokia 1', 'DG Nokia 2', 'THGF Nokia 3', 'SFSD Nokia 4', 'Nokia XG', 'Nokia YO'] labels = [1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0] tree = StringTree() tree.build(strings, labels, min_precision=0.75, min_token_length=1) Let's see what regular expressions were extracted. for leaf in tree.leaves: print(leaf) # Output: # PatternNode(\".+ .+a.+\", right=None, left=PatternNode(.+0.+), n_strings=11, precision=1.0, recall=0.57) # PatternNode(\".+0.+\", right=None, left=None, n_strings=7, precision=1.0, recall=1.0) You may need to check the precision and recall of the whole tree for a given set of strings and true labels. print('Precision: {}'.format(tree.precision_score(strings, labels))) # Precision: 1.0 print('Recall: {}'.format(tree.precision_score(strings, labels))) # Recall: 1.0 Finally, you can pass any strings you want and see if they match to extracted regular expressions or not. matches = tree.match(other_strings) # You will receive a vector of the same size as other_strings containing 0's (no match) or 1's (match) Installing Use PyPI: pip install strtree Use a distribution file located in the dist folder: pip install strtree-0.1.0-py3-none-any.whl Contribution You are very welcome to participate in the project. 
You may solve the current issues or add new functionality - it is up to you to.","title":"Overview"},{"location":"#basics-to-strtree","text":"strtree is a Python package for strings binary classification, based on regular expressions put in a decision tree. Github repo: stretree With strtree you can: Do a binary classification of your strings using automatically extracted regular expressions Find shortest regular expressions which covers strings with positive labels in the most accurate way Look at a quick example.","title":"Basics to strtree"},{"location":"#example","text":"Firstly, let's build a tree from strings and their labels. import strtree strings = ['Samsung X-500', 'Samsung SM-10', 'Samsung X-1100', 'Samsung F-10', 'Samsung X-2200', 'AB Nokia 1', 'DG Nokia 2', 'THGF Nokia 3', 'SFSD Nokia 4', 'Nokia XG', 'Nokia YO'] labels = [1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0] tree = StringTree() tree.build(strings, labels, min_precision=0.75, min_token_length=1) Let's see what regular expressions were extracted. for leaf in tree.leaves: print(leaf) # Output: # PatternNode(\".+ .+a.+\", right=None, left=PatternNode(.+0.+), n_strings=11, precision=1.0, recall=0.57) # PatternNode(\".+0.+\", right=None, left=None, n_strings=7, precision=1.0, recall=1.0) You may need to check the precision and recall of the whole tree for a given set of strings and true labels. print('Precision: {}'.format(tree.precision_score(strings, labels))) # Precision: 1.0 print('Recall: {}'.format(tree.precision_score(strings, labels))) # Recall: 1.0 Finally, you can pass any strings you want and see if they match to extracted regular expressions or not. 
matches = tree.match(other_strings) # You will receive a vector of the same size as other_strings containing 0's (no match) or 1's (match)","title":"Example"},{"location":"#installing","text":"Use PyPI: pip install strtree Use a distribution file located in the dist folder: pip install strtree-0.1.0-py3-none-any.whl","title":"Installing"},{"location":"#contribution","text":"You are very welcome to participate in the project. You may solve the current issues or add new functionality - it is up to you to.","title":"Contribution"},{"location":"pattern_reference/","text":"Pattern Pattern ( pattern ) Class for representing a pattern (a regular expression). Initialize a Pattern. Parameters: pattern ( str | Pattern ) \u2013 Str or Pattern object representing a regular expressions. It must be compatible with re.compile method. filter filter ( strings , labels = None ) Return strings and labels which match the pattern and which don't. Parameters: strings ( list [ str ] ) \u2013 Strings to filter. labels ( list [ int ] ) \u2013 Labels of strings. Returns: matched_strings \u2013 Strings which match the pattern. labels_of_matched \u2013 Labels of strings which match the pattern. If no labels provided, an empty list. not_matched_strings \u2013 Strings which don't match the pattern. labels_of_not_matched \u2013 Labels of strings which don't match the pattern. If no labels provided, an empty list. match match ( string ) Verify if the pattern matches the string (at any place). scores scores ( strings , labels ) Calculate classification quality scores for pattern's matches and the labels. Pattern's matches are considered as \"predictions\" when calculating quality metrics. Parameters: strings ( list [ str ] ) \u2013 A list of strings. labels ( list [ int ] ) \u2013 A list of strings labels consisting of 0 and 1. 
Returns: response ( dict ) \u2013 Contains keys: 'n_strings', 'total_positive', 'n_matches', 'precision', 'recall' and 'accuracy'.","title":"Pattern"},{"location":"pattern_reference/#utils.Pattern","text":"Pattern ( pattern ) Class for representing a pattern (a regular expression). Initialize a Pattern. Parameters: pattern ( str | Pattern ) \u2013 Str or Pattern object representing a regular expressions. It must be compatible with re.compile method.","title":"Pattern"},{"location":"pattern_reference/#utils.Pattern.filter","text":"filter ( strings , labels = None ) Return strings and labels which match the pattern and which don't. Parameters: strings ( list [ str ] ) \u2013 Strings to filter. labels ( list [ int ] ) \u2013 Labels of strings. Returns: matched_strings \u2013 Strings which match the pattern. labels_of_matched \u2013 Labels of strings which match the pattern. If no labels provided, an empty list. not_matched_strings \u2013 Strings which don't match the pattern. labels_of_not_matched \u2013 Labels of strings which don't match the pattern. If no labels provided, an empty list.","title":"filter()"},{"location":"pattern_reference/#utils.Pattern.match","text":"match ( string ) Verify if the pattern matches the string (at any place).","title":"match()"},{"location":"pattern_reference/#utils.Pattern.scores","text":"scores ( strings , labels ) Calculate classification quality scores for pattern's matches and the labels. Pattern's matches are considered as \"predictions\" when calculating quality metrics. Parameters: strings ( list [ str ] ) \u2013 A list of strings. labels ( list [ int ] ) \u2013 A list of strings labels consisting of 0 and 1. Returns: response ( dict ) \u2013 Contains keys: 'n_strings', 'total_positive', 'n_matches', 'precision', 'recall' and 'accuracy'.","title":"scores()"},{"location":"patternnode_reference/","text":"PatternNode PatternNode ( pattern : Pattern ) Class representing a node in a StringTree. 
Attributes: right ( PatternNode ) \u2013 Node with matching strings. left ( PatternNode ) \u2013 Node with non-matching strings. pattern ( Pattern ) \u2013 Attributed Pattern object. matches \u2013 Strings which match a Pattern object attributed to the PatternNode. strings \u2013 All strings attributed to the PatternNode. labels \u2013 Labels of all strings attributed to the PatternNode. scores \u2013 Scores of an attributed pattern applied to the attributed strings. Initialize a PatternNode. Parameters: pattern ( Pattern ) \u2013 A Pattern object attributed to a node. get_labels get_labels () Get PatternNode.labels attribute get_matches get_matches () Get PatternNode.matches attribute get_pattern get_pattern () Get PatternNode.pattern attribute get_scores get_scores () Get PatternNode.scores attribute get_strings get_strings () Get PatternNode.strings attribute set_labels set_labels ( labels ) Set PatternNode.labels attribute set_matches set_matches ( matches ) Set PatternNode.matches attribute set_pattern set_pattern ( pattern ) Set PatternNode.pattern attribute set_scores set_scores ( scores ) Set PatternNode.scores attribute set_strings set_strings ( strings ) Set PatternNode.strings attribute","title":"PatternNode"},{"location":"patternnode_reference/#utils.PatternNode","text":"PatternNode ( pattern : Pattern ) Class representing a node in a StringTree. Attributes: right ( PatternNode ) \u2013 Node with matching strings. left ( PatternNode ) \u2013 Node with non-matching strings. pattern ( Pattern ) \u2013 Attributed Pattern object. matches \u2013 Strings which match a Pattern object attributed to the PatternNode. strings \u2013 All strings attributed to the PatternNode. labels \u2013 Labels of all strings attributed to the PatternNode. scores \u2013 Scores of an attributed pattern applied to the attributed strings. Initialize a PatternNode. 
Parameters: pattern ( Pattern ) \u2013 A Pattern object attributed to a node.","title":"PatternNode"},{"location":"patternnode_reference/#utils.PatternNode.get_labels","text":"get_labels () Get PatternNode.labels attribute","title":"get_labels()"},{"location":"patternnode_reference/#utils.PatternNode.get_matches","text":"get_matches () Get PatternNode.matches attribute","title":"get_matches()"},{"location":"patternnode_reference/#utils.PatternNode.get_pattern","text":"get_pattern () Get PatternNode.pattern attribute","title":"get_pattern()"},{"location":"patternnode_reference/#utils.PatternNode.get_scores","text":"get_scores () Get PatternNode.scores attribute","title":"get_scores()"},{"location":"patternnode_reference/#utils.PatternNode.get_strings","text":"get_strings () Get PatternNode.strings attribute","title":"get_strings()"},{"location":"patternnode_reference/#utils.PatternNode.set_labels","text":"set_labels ( labels ) Set PatternNode.labels attribute","title":"set_labels()"},{"location":"patternnode_reference/#utils.PatternNode.set_matches","text":"set_matches ( matches ) Set PatternNode.matches attribute","title":"set_matches()"},{"location":"patternnode_reference/#utils.PatternNode.set_pattern","text":"set_pattern ( pattern ) Set PatternNode.pattern attribute","title":"set_pattern()"},{"location":"patternnode_reference/#utils.PatternNode.set_scores","text":"set_scores ( scores ) Set PatternNode.scores attribute","title":"set_scores()"},{"location":"patternnode_reference/#utils.PatternNode.set_strings","text":"set_strings ( strings ) Set PatternNode.strings attribute","title":"set_strings()"},{"location":"stringtree_reference/","text":"StringTree StringTree () A class for binary classification of strings with regular expressions. Each node is an instance of the PatternNode class. It contains a regular expression and metadata. Attributes: root ( PatternNode ) \u2013 The root PatternNode. leaves ( list [ PatternNode ] ) \u2013 List of all nodes. 
Initialize a StringTree object. build build ( strings , labels , min_precision = 0.5 , min_token_length = 1 , max_patterns = None , min_matches_leaf = 1 , min_strings_leaf = 1 , verbose = False ) Build a StringTree. For the StringTree object being used, create nodes and corresponding patterns. Use provided strings and labels. Parameters: strings ( list [ str ] ) \u2013 List of strings. labels ( list [ int ] ) \u2013 List of labels (0 or 1). min_precision ( float ) \u2013 The minimal precision of a pattern in the tree. min_token_length ( int ) \u2013 The initial length of the pattern. max_patterns ( int ) \u2013 The highest amount of patterns. Once the method finds more, it stops. min_matches_leaf ( int ) \u2013 The minimal amount of matches in one node. min_strings_leaf ( int ) \u2013 The minimal amount of strings in one node. verbose ( bool ) \u2013 If to provide additinal text output. filter filter ( strings , return_nodes = False ) Return strings matching the tree and corresponding nodes. A string matches a tree if it matches at least one node. Parameters: strings ( list [ str ] ) \u2013 List of strings. return_nodes ( bool ) \u2013 Flag indicating if to return nodes corresponding to the matched strings. If False, only matched strings are returned. Returns: matches ( list [ int ] ) \u2013 List containing matching strings. matched_nodes ( list [ PatternNode ] ) \u2013 List consisting of PatternNodes of matching strings. Returned only if return_nodes is True. get_leaves get_leaves () Get leaves attribute. match match ( strings , return_nodes = False ) Return flags indicating if strings match the tree. A string matches a tree if it matches at least one node. Parameters: strings ( list [ str ] ) \u2013 List of strings. return_nodes ( bool ) \u2013 Flag indicating if to return nodes corresponding to the matched strings. If False, only matched strings are returned. Returns: matches ( list [ int ] ) \u2013 List containing 1 (match) and 0 (no match) for each string. 
matched_nodes ( list [ PatternNode ] ) \u2013 List consisting of PatternNodes of matching strings. Returned only if return_nodes is True. precision_score precision_score ( strings , labels ) Calculate a precision score for given strings and labels. recall_score recall_score ( strings , labels ) Calculate a recall score for given strings and labels. set_leaves set_leaves ( leaves ) Set leaves attribute.","title":"StringTree"},{"location":"stringtree_reference/#utils.StringTree","text":"StringTree () A class for binary classification of strings with regular expressions. Each node is an instance of the PatternNode class. It contains a regular expression and metadata. Attributes: root ( PatternNode ) \u2013 The root PatternNode. leaves ( list [ PatternNode ] ) \u2013 List of all nodes. Initialize a StringTree object.","title":"StringTree"},{"location":"stringtree_reference/#utils.StringTree.build","text":"build ( strings , labels , min_precision = 0.5 , min_token_length = 1 , max_patterns = None , min_matches_leaf = 1 , min_strings_leaf = 1 , verbose = False ) Build a StringTree. For the StringTree object being used, create nodes and corresponding patterns. Use provided strings and labels. Parameters: strings ( list [ str ] ) \u2013 List of strings. labels ( list [ int ] ) \u2013 List of labels (0 or 1). min_precision ( float ) \u2013 The minimal precision of a pattern in the tree. min_token_length ( int ) \u2013 The initial length of the pattern. max_patterns ( int ) \u2013 The highest amount of patterns. Once the method finds more, it stops. min_matches_leaf ( int ) \u2013 The minimal amount of matches in one node. min_strings_leaf ( int ) \u2013 The minimal amount of strings in one node. verbose ( bool ) \u2013 If to provide additinal text output.","title":"build()"},{"location":"stringtree_reference/#utils.StringTree.filter","text":"filter ( strings , return_nodes = False ) Return strings matching the tree and corresponding nodes. 
A string matches a tree if it matches at least one node. Parameters: strings ( list [ str ] ) \u2013 List of strings. return_nodes ( bool ) \u2013 Flag indicating if to return nodes corresponding to the matched strings. If False, only matched strings are returned. Returns: matches ( list [ int ] ) \u2013 List containing matching strings. matched_nodes ( list [ PatternNode ] ) \u2013 List consisting of PatternNodes of matching strings. Returned only if return_nodes is True.","title":"filter()"},{"location":"stringtree_reference/#utils.StringTree.get_leaves","text":"get_leaves () Get leaves attribute.","title":"get_leaves()"},{"location":"stringtree_reference/#utils.StringTree.match","text":"match ( strings , return_nodes = False ) Return flags indicating if strings match the tree. A string matches a tree if it matches at least one node. Parameters: strings ( list [ str ] ) \u2013 List of strings. return_nodes ( bool ) \u2013 Flag indicating if to return nodes corresponding to the matched strings. If False, only matched strings are returned. Returns: matches ( list [ int ] ) \u2013 List containing 1 (match) and 0 (no match) for each string. matched_nodes ( list [ PatternNode ] ) \u2013 List consisting of PatternNodes of matching strings. Returned only if return_nodes is True.","title":"match()"},{"location":"stringtree_reference/#utils.StringTree.precision_score","text":"precision_score ( strings , labels ) Calculate a precision score for given strings and labels.","title":"precision_score()"},{"location":"stringtree_reference/#utils.StringTree.recall_score","text":"recall_score ( strings , labels ) Calculate a recall score for given strings and labels.","title":"recall_score()"},{"location":"stringtree_reference/#utils.StringTree.set_leaves","text":"set_leaves ( leaves ) Set leaves attribute.","title":"set_leaves()"}]}
\ No newline at end of file
+{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Basics to strtree strtree is a Python package for strings binary classification, based on regular expressions put in a decision tree. Github repo: stretree With strtree you can: Do a binary classification of your strings using automatically extracted regular expressions Find shortest regular expressions which covers strings with positive labels in the most accurate way Look at a quick example. Example Firstly, let's build a tree from strings and their labels. import strtree strings = ['Samsung X-500', 'Samsung SM-10', 'Samsung X-1100', 'Samsung F-10', 'Samsung X-2200', 'AB Nokia 1', 'DG Nokia 2', 'THGF Nokia 3', 'SFSD Nokia 4', 'Nokia XG', 'Nokia YO'] labels = [1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0] tree = StringTree() tree.build(strings, labels, min_precision=0.75, min_token_length=1) Let's see what regular expressions were extracted. for leaf in tree.leaves: print(leaf) # Output: # PatternNode(\".+ .+a.+\", right=None, left=PatternNode(.+0.+), n_strings=11, precision=1.0, recall=0.57) # PatternNode(\".+0.+\", right=None, left=None, n_strings=7, precision=1.0, recall=1.0) You may need to check the precision and recall of the whole tree for a given set of strings and true labels. print('Precision: {}'.format(tree.precision_score(strings, labels))) # Precision: 1.0 print('Recall: {}'.format(tree.precision_score(strings, labels))) # Recall: 1.0 Finally, you can pass any strings you want and see if they match to extracted regular expressions or not. matches = tree.match(other_strings) # You will receive a vector of the same size as other_strings containing 0's (no match) or 1's (match) Installing Use PyPI: pip install strtree Use a distribution file located in the dist folder: pip install strtree-0.1.0-py3-none-any.whl Contribution You are very welcome to participate in the project. 
You may solve the current issues or add new functionality - it is up to you to.","title":"Overview"},{"location":"#basics-to-strtree","text":"strtree is a Python package for strings binary classification, based on regular expressions put in a decision tree. Github repo: stretree With strtree you can: Do a binary classification of your strings using automatically extracted regular expressions Find shortest regular expressions which covers strings with positive labels in the most accurate way Look at a quick example.","title":"Basics to strtree"},{"location":"#example","text":"Firstly, let's build a tree from strings and their labels. import strtree strings = ['Samsung X-500', 'Samsung SM-10', 'Samsung X-1100', 'Samsung F-10', 'Samsung X-2200', 'AB Nokia 1', 'DG Nokia 2', 'THGF Nokia 3', 'SFSD Nokia 4', 'Nokia XG', 'Nokia YO'] labels = [1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0] tree = StringTree() tree.build(strings, labels, min_precision=0.75, min_token_length=1) Let's see what regular expressions were extracted. for leaf in tree.leaves: print(leaf) # Output: # PatternNode(\".+ .+a.+\", right=None, left=PatternNode(.+0.+), n_strings=11, precision=1.0, recall=0.57) # PatternNode(\".+0.+\", right=None, left=None, n_strings=7, precision=1.0, recall=1.0) You may need to check the precision and recall of the whole tree for a given set of strings and true labels. print('Precision: {}'.format(tree.precision_score(strings, labels))) # Precision: 1.0 print('Recall: {}'.format(tree.precision_score(strings, labels))) # Recall: 1.0 Finally, you can pass any strings you want and see if they match to extracted regular expressions or not. 
matches = tree.match(other_strings) # You will receive a vector of the same size as other_strings containing 0's (no match) or 1's (match)","title":"Example"},{"location":"#installing","text":"Use PyPI: pip install strtree Use a distribution file located in the dist folder: pip install strtree-0.1.0-py3-none-any.whl","title":"Installing"},{"location":"#contribution","text":"You are very welcome to participate in the project. You may solve the current issues or add new functionality - it is up to you to.","title":"Contribution"},{"location":"pattern_reference/","text":"Pattern Pattern ( pattern ) Class for representing a pattern (a regular expression). Initialize a Pattern. Parameters: pattern ( str | Pattern ) \u2013 Str or Pattern object representing a regular expressions. It must be compatible with re.compile method. filter filter ( strings ) Return a boolean mask for strings matching the pattern. Parameters: strings ( list [ str ] ) \u2013 Strings to filter. Returns: matches \u2013 Numpy array of bool type. match match ( string ) Verify if the pattern matches the string (at any place). scores scores ( strings , labels , full_labels_list = None ) Calculate classification quality scores for pattern's matches and the labels. Pattern's matches are considered as \"predictions\" when calculating quality metrics. Parameters: strings ( list [ str ] ) \u2013 A list of strings. labels ( list [ int ] ) \u2013 A list of strings labels. full_labels_list ( list [ int ] , default: None ) \u2013 A full list of unique labels that can be present among labels. Returns: response ( dict ) \u2013 Contains keys: 'n_strings', 'total_positive', 'n_matches', 'precision', 'recall' and 'accuracy'. If there are more than 2 classes, each key is a list corresponding to each class.","title":"Pattern"},{"location":"pattern_reference/#utils.Pattern","text":"Pattern ( pattern ) Class for representing a pattern (a regular expression). Initialize a Pattern. 
Parameters: pattern ( str | Pattern ) \u2013 Str or Pattern object representing a regular expressions. It must be compatible with re.compile method.","title":"Pattern"},{"location":"pattern_reference/#utils.Pattern.filter","text":"filter ( strings ) Return a boolean mask for strings matching the pattern. Parameters: strings ( list [ str ] ) \u2013 Strings to filter. Returns: matches \u2013 Numpy array of bool type.","title":"filter"},{"location":"pattern_reference/#utils.Pattern.match","text":"match ( string ) Verify if the pattern matches the string (at any place).","title":"match"},{"location":"pattern_reference/#utils.Pattern.scores","text":"scores ( strings , labels , full_labels_list = None ) Calculate classification quality scores for pattern's matches and the labels. Pattern's matches are considered as \"predictions\" when calculating quality metrics. Parameters: strings ( list [ str ] ) \u2013 A list of strings. labels ( list [ int ] ) \u2013 A list of strings labels. full_labels_list ( list [ int ] , default: None ) \u2013 A full list of unique labels that can be present among labels. Returns: response ( dict ) \u2013 Contains keys: 'n_strings', 'total_positive', 'n_matches', 'precision', 'recall' and 'accuracy'. If there are more than 2 classes, each key is a list corresponding to each class.","title":"scores"},{"location":"patternnode_reference/","text":"PatternNode PatternNode ( pattern : Pattern ) Class representing a node in a StringTree. Attributes: right ( PatternNode ) \u2013 Node with matching strings. left ( PatternNode ) \u2013 Node with non-matching strings. pattern ( Pattern ) \u2013 Attributed Pattern object. matches \u2013 Strings which match a Pattern object attributed to the PatternNode. strings \u2013 All strings attributed to the PatternNode. labels \u2013 Labels of all strings attributed to the PatternNode. scores \u2013 Scores of an attributed pattern applied to the attributed strings. Initialize a PatternNode. 
Parameters: pattern ( Pattern ) \u2013 A Pattern object attributed to a node. get_labels get_labels () Get PatternNode.labels attribute get_matches get_matches () Get PatternNode.matches attribute get_pattern get_pattern () Get PatternNode.pattern attribute get_scores get_scores () Get PatternNode.scores attribute get_strings get_strings () Get PatternNode.strings attribute set_labels set_labels ( labels ) Set PatternNode.labels attribute set_matches set_matches ( matches ) Set PatternNode.matches attribute set_pattern set_pattern ( pattern ) Set PatternNode.pattern attribute set_scores set_scores ( scores ) Set PatternNode.scores attribute set_strings set_strings ( strings ) Set PatternNode.strings attribute","title":"PatternNode"},{"location":"patternnode_reference/#utils.PatternNode","text":"PatternNode ( pattern : Pattern ) Class representing a node in a StringTree. Attributes: right ( PatternNode ) \u2013 Node with matching strings. left ( PatternNode ) \u2013 Node with non-matching strings. pattern ( Pattern ) \u2013 Attributed Pattern object. matches \u2013 Strings which match a Pattern object attributed to the PatternNode. strings \u2013 All strings attributed to the PatternNode. labels \u2013 Labels of all strings attributed to the PatternNode. scores \u2013 Scores of an attributed pattern applied to the attributed strings. Initialize a PatternNode. 
Parameters: pattern ( Pattern ) \u2013 A Pattern object attributed to a node.","title":"PatternNode"},{"location":"patternnode_reference/#utils.PatternNode.get_labels","text":"get_labels () Get PatternNode.labels attribute","title":"get_labels"},{"location":"patternnode_reference/#utils.PatternNode.get_matches","text":"get_matches () Get PatternNode.matches attribute","title":"get_matches"},{"location":"patternnode_reference/#utils.PatternNode.get_pattern","text":"get_pattern () Get PatternNode.pattern attribute","title":"get_pattern"},{"location":"patternnode_reference/#utils.PatternNode.get_scores","text":"get_scores () Get PatternNode.scores attribute","title":"get_scores"},{"location":"patternnode_reference/#utils.PatternNode.get_strings","text":"get_strings () Get PatternNode.strings attribute","title":"get_strings"},{"location":"patternnode_reference/#utils.PatternNode.set_labels","text":"set_labels ( labels ) Set PatternNode.labels attribute","title":"set_labels"},{"location":"patternnode_reference/#utils.PatternNode.set_matches","text":"set_matches ( matches ) Set PatternNode.matches attribute","title":"set_matches"},{"location":"patternnode_reference/#utils.PatternNode.set_pattern","text":"set_pattern ( pattern ) Set PatternNode.pattern attribute","title":"set_pattern"},{"location":"patternnode_reference/#utils.PatternNode.set_scores","text":"set_scores ( scores ) Set PatternNode.scores attribute","title":"set_scores"},{"location":"patternnode_reference/#utils.PatternNode.set_strings","text":"set_strings ( strings ) Set PatternNode.strings attribute","title":"set_strings"},{"location":"stringtree_reference/","text":"StringTree StringTree () A class for binary classification of strings with regular expressions. Each node is an instance of the PatternNode class. It contains a regular expression and metadata. Attributes: root ( PatternNode ) \u2013 The root PatternNode. leaves ( list [ PatternNode ] ) \u2013 List of all nodes. Initialize a StringTree object. 
build build ( strings , labels , min_precision = 0.5 , min_token_length = 1 , max_patterns = None , min_matches_leaf = 1 , min_strings_leaf = 1 , verbose = False , ) Build a StringTree. For the StringTree object being used, create nodes and corresponding patterns. Use provided strings and labels. Parameters: strings ( list [ str ] ) \u2013 List of strings. labels ( list [ int ] ) \u2013 List of labels (0 or 1). min_precision ( float , default: 0.5 ) \u2013 The minimal precision of a pattern in the tree. min_token_length ( int , default: 1 ) \u2013 The initial length of the pattern. max_patterns ( int , default: None ) \u2013 The highest amount of patterns. Once the method finds more, it stops. min_matches_leaf ( int , default: 1 ) \u2013 The minimal amount of matches in one node. min_strings_leaf ( int , default: 1 ) \u2013 The minimal amount of strings in one node. verbose ( bool , default: False ) \u2013 If to provide additinal text output. filter filter ( strings , return_nodes = False ) Return strings matching the tree and corresponding nodes. A string matches a tree if it matches at least one node. Parameters: strings ( list [ str ] ) \u2013 List of strings. return_nodes ( bool , default: False ) \u2013 Flag indicating if to return nodes corresponding to the matched strings. If False, only matched strings are returned. Returns: matches ( list [ int ] ) \u2013 List containing matching strings. matched_nodes ( list [ PatternNode ] ) \u2013 List consisting of PatternNodes of matching strings. Returned only if return_nodes is True. get_leaves get_leaves () Get leaves attribute. get_nodes_by_label get_nodes_by_label ( label ) Get nodes where the label is the most probable. match match ( strings , return_nodes = False ) Return flags indicating if strings match the tree. A string matches a tree if it matches at least one node. Parameters: strings ( list [ str ] ) \u2013 List of strings. 
return_nodes ( bool , default: False ) \u2013 Flag indicating if to return nodes corresponding to the matched strings. If False, only matched strings are returned. Returns: matches ( list [ int ] ) \u2013 List containing 1 (match) and 0 (no match) for each string. matched_nodes ( list [ PatternNode ] ) \u2013 List consisting of PatternNodes of matching strings. If not match found, None is retured. Returned only if return_nodes is True. precision_score precision_score ( strings , labels ) Calculate a precision score for given strings and labels. predict_label predict_label ( strings , return_nodes = False ) Predict labels for given strings. recall_score recall_score ( strings , labels ) Calculate a recall score for given strings and labels. set_leaves set_leaves ( leaves ) Set leaves attribute.","title":"StringTree"},{"location":"stringtree_reference/#utils.StringTree","text":"StringTree () A class for binary classification of strings with regular expressions. Each node is an instance of the PatternNode class. It contains a regular expression and metadata. Attributes: root ( PatternNode ) \u2013 The root PatternNode. leaves ( list [ PatternNode ] ) \u2013 List of all nodes. Initialize a StringTree object.","title":"StringTree"},{"location":"stringtree_reference/#utils.StringTree.build","text":"build ( strings , labels , min_precision = 0.5 , min_token_length = 1 , max_patterns = None , min_matches_leaf = 1 , min_strings_leaf = 1 , verbose = False , ) Build a StringTree. For the StringTree object being used, create nodes and corresponding patterns. Use provided strings and labels. Parameters: strings ( list [ str ] ) \u2013 List of strings. labels ( list [ int ] ) \u2013 List of labels (0 or 1). min_precision ( float , default: 0.5 ) \u2013 The minimal precision of a pattern in the tree. min_token_length ( int , default: 1 ) \u2013 The initial length of the pattern. max_patterns ( int , default: None ) \u2013 The highest amount of patterns. 
Once the method finds more, it stops. min_matches_leaf ( int , default: 1 ) \u2013 The minimal amount of matches in one node. min_strings_leaf ( int , default: 1 ) \u2013 The minimal amount of strings in one node. verbose ( bool , default: False ) \u2013 If to provide additinal text output.","title":"build"},{"location":"stringtree_reference/#utils.StringTree.filter","text":"filter ( strings , return_nodes = False ) Return strings matching the tree and corresponding nodes. A string matches a tree if it matches at least one node. Parameters: strings ( list [ str ] ) \u2013 List of strings. return_nodes ( bool , default: False ) \u2013 Flag indicating if to return nodes corresponding to the matched strings. If False, only matched strings are returned. Returns: matches ( list [ int ] ) \u2013 List containing matching strings. matched_nodes ( list [ PatternNode ] ) \u2013 List consisting of PatternNodes of matching strings. Returned only if return_nodes is True.","title":"filter"},{"location":"stringtree_reference/#utils.StringTree.get_leaves","text":"get_leaves () Get leaves attribute.","title":"get_leaves"},{"location":"stringtree_reference/#utils.StringTree.get_nodes_by_label","text":"get_nodes_by_label ( label ) Get nodes where the label is the most probable.","title":"get_nodes_by_label"},{"location":"stringtree_reference/#utils.StringTree.match","text":"match ( strings , return_nodes = False ) Return flags indicating if strings match the tree. A string matches a tree if it matches at least one node. Parameters: strings ( list [ str ] ) \u2013 List of strings. return_nodes ( bool , default: False ) \u2013 Flag indicating if to return nodes corresponding to the matched strings. If False, only matched strings are returned. Returns: matches ( list [ int ] ) \u2013 List containing 1 (match) and 0 (no match) for each string. matched_nodes ( list [ PatternNode ] ) \u2013 List consisting of PatternNodes of matching strings. If not match found, None is retured. 
Returned only if return_nodes is True.","title":"match"},{"location":"stringtree_reference/#utils.StringTree.precision_score","text":"precision_score ( strings , labels ) Calculate a precision score for given strings and labels.","title":"precision_score"},{"location":"stringtree_reference/#utils.StringTree.predict_label","text":"predict_label ( strings , return_nodes = False ) Predict labels for given strings.","title":"predict_label"},{"location":"stringtree_reference/#utils.StringTree.recall_score","text":"recall_score ( strings , labels ) Calculate a recall score for given strings and labels.","title":"recall_score"},{"location":"stringtree_reference/#utils.StringTree.set_leaves","text":"set_leaves ( leaves ) Set leaves attribute.","title":"set_leaves"}]}
\ No newline at end of file
diff --git a/site/sitemap.xml.gz b/site/sitemap.xml.gz
index ab96ed1..877dbc5 100644
Binary files a/site/sitemap.xml.gz and b/site/sitemap.xml.gz differ
diff --git a/site/stringtree_reference/index.html b/site/stringtree_reference/index.html
index 024dc11..70454fc 100644
--- a/site/stringtree_reference/index.html
+++ b/site/stringtree_reference/index.html
@@ -89,31 +89,39 @@
- StringTree
- - build()
+
- build
- - filter()
+
- filter
- - get_leaves()
+
- get_leaves
- - match()
+
- get_nodes_by_label
- - precision_score()
+
- match
- - recall_score()
+
- precision_score
- - set_leaves()
+
- predict_label
+
+
+ - recall_score
+
+
+ - set_leaves
@@ -128,12 +136,12 @@
-
- StringTree
+
+ StringTree
-StringTree()
+
@@ -142,10 +150,12 @@
A class for binary classification of strings with regular expressions.
Each node is an instance of the PatternNode class. It contains a regular expression and metadata.
+
+
Attributes:
-
Initialize a StringTree object.
@@ -178,16 +187,26 @@
+
-
- build
+
+ build
-build(strings, labels, min_precision = 0.5, min_token_length = 1, max_patterns = None, min_matches_leaf = 1, min_strings_leaf = 1, verbose = False)
+build(
+ strings,
+ labels,
+ min_precision=0.5,
+ min_token_length=1,
+ max_patterns=None,
+ min_matches_leaf=1,
+ min_strings_leaf=1,
+ verbose=False,
+)
@@ -195,10 +214,12 @@
Build a StringTree.
For the StringTree object being used, create nodes and corresponding patterns. Use provided strings and labels.
+
+
Parameters:
-
- strings
+
strings
(list[str]
)
–
@@ -206,7 +227,7 @@
-
- labels
+
labels
(list[int]
)
–
@@ -214,48 +235,60 @@
-
- min_precision
- (
float
)
+ min_precision
+ (float
, default:
+ 0.5
+)
–
The minimal precision of a pattern in the tree.
-
- min_token_length
- (
int
)
+ min_token_length
+ (int
, default:
+ 1
+)
–
The initial length of the pattern.
-
- max_patterns
- (
int
)
+ max_patterns
+ (int
, default:
+ None
+)
–
The highest amount of patterns. Once the method finds more, it stops.
-
- min_matches_leaf
- (
int
)
+ min_matches_leaf
+ (int
, default:
+ 1
+)
–
The minimal amount of matches in one node.
-
- min_strings_leaf
- (
int
)
+ min_strings_leaf
+ (int
, default:
+ 1
+)
–
The minimal amount of strings in one node.
-
- verbose
- (
bool
)
+ verbose
+ (bool
, default:
+ False
+)
–
Whether to provide additional text output.
@@ -267,16 +300,17 @@
+
-
- filter
+
+ filter
-filter(strings, return_nodes = False)
+filter(strings, return_nodes=False)
@@ -284,10 +318,12 @@
Return strings matching the tree and corresponding nodes.
A string matches a tree if it matches at least one node.
+
+
Parameters:
-
- strings
+
strings
(list[str]
)
–
@@ -295,8 +331,10 @@
-
- return_nodes
- (
bool
)
+ return_nodes
+ (bool
, default:
+ False
+)
–
Flag indicating whether to return the nodes corresponding to the matched strings.
@@ -305,17 +343,19 @@
+
+
Returns:
-
-matches(
list[int]
+matches
( list[int]
) –
List containing matching strings.
-
-matched_nodes(
list[PatternNode]
+matched_nodes
( list[PatternNode]
) –
List consisting of PatternNodes of matching strings. Returned only if return_nodes is True.
@@ -327,16 +367,17 @@
+
-
- get_leaves
+
+ get_leaves
-get_leaves()
+
@@ -347,16 +388,38 @@
+
+
+
+
+
+
+ get_nodes_by_label
+
+
+
+
get_nodes_by_label(label)
+
+
+
+
+
Get nodes where the label is the most probable.
+
+
+
+
+
+
-
- match
+
+ match
-match(strings, return_nodes = False)
+match(strings, return_nodes=False)
@@ -364,10 +427,12 @@
Return flags indicating if strings match the tree.
A string matches a tree if it matches at least one node.
+
+
Parameters:
-
- strings
+
strings
(list[str]
)
–
@@ -375,8 +440,10 @@
-
- return_nodes
- (
bool
)
+ return_nodes
+ (bool
, default:
+ False
+)
–
Flag indicating whether to return the nodes corresponding to the matched strings.
@@ -385,20 +452,23 @@
+
+
Returns:
-
-matches(
list[int]
+matches
( list[int]
) –
List containing 1 (match) and 0 (no match) for each string.
-
-matched_nodes(
list[PatternNode]
+matched_nodes
( list[PatternNode]
) –
-
List consisting of PatternNodes of matching strings. Returned only if return_nodes is True.
+
List consisting of PatternNodes of matching strings. If no match is found, None is returned.
+Returned only if return_nodes is True.
@@ -407,16 +477,17 @@
+
-
- precision_score
+
+ precision_score
-precision_score(strings, labels)
+precision_score(strings, labels)
@@ -427,16 +498,38 @@
+
-
- recall_score
+
+ predict_label
-recall_score(strings, labels)
+predict_label(strings, return_nodes=False)
+
+
+
+
+
Predict labels for given strings.
+
+
+
+
+
+
+
+
+
+
+
+ recall_score
+
+
+
+
recall_score(strings, labels)
@@ -447,16 +540,17 @@
+
-
- set_leaves
+
+ set_leaves
-set_leaves(leaves)
+
@@ -473,6 +567,7 @@
+
diff --git a/src/strtree/utils.py b/src/strtree/utils.py
index 5ab92ca..17be469 100644
--- a/src/strtree/utils.py
+++ b/src/strtree/utils.py
@@ -5,6 +5,7 @@
class Pattern:
"""Class for representing a pattern (a regular expression)."""
+
def __init__(self, pattern):
"""Initialize a Pattern.
@@ -31,9 +32,10 @@ def __repr__(self):
def __copy__(self):
return Pattern(self.str)
- def scores(self, strings, labels):
+ def _binary_class_scores(self, strings, labels):
"""Calculate classification quality scores for pattern's matches and the labels.
+ Labels must be binary (must consist of 0 and 1 only).
Pattern's matches are considered as "predictions" when calculating quality metrics.
Parameters
@@ -41,33 +43,29 @@ def scores(self, strings, labels):
strings : list[str]
A list of strings.
labels : list[int]
- A list of strings labels consisting of 0 and 1.
+ A list of strings labels.
Returns
-------
response : dict
- Contains keys: 'n_strings', 'total_positive', 'n_matches', 'precision', 'recall' and 'accuracy'.
+ Contains keys: 'n_strings', 'total_positive', 'n_matches', 'precision', 'recall', 'accuracy'.
"""
true_positive = 0
true_negative = 0
total_positive = sum(labels)
n_strings = len(strings)
- n_matches = 0
- for i in range(n_strings):
- match = self.match(strings[i])
- if match:
- n_matches += 1
- if labels[i] == 1:
- true_positive += 1
- else:
- if labels[i] == 0:
- true_negative += 1
+
+ matches = np.array(list(map(self.match, strings)))
+ n_matches = np.sum(matches)
+ true_positive = np.sum(matches & labels)
+ true_negative = np.sum(~matches & (labels == 0))
+
if n_matches == 0:
precision = 0.
accuracy = 0.
else:
precision = true_positive / n_matches
- accuracy = (true_positive + true_negative) / n_matches
+ accuracy = (true_positive + true_negative) / n_strings
if total_positive == 0:
recall = 0.
else:
@@ -82,48 +80,73 @@ def scores(self, strings, labels):
}
return response
+ def scores(self, strings, labels, full_labels_list=None):
+ """Calculate classification quality scores for pattern's matches and the labels.
+
+ Pattern's matches are considered as "predictions" when calculating quality metrics.
+
+ Parameters
+ ----------
+ strings : list[str]
+ A list of strings.
+ labels : list[int]
+ A list of strings labels.
+ full_labels_list : list[int], default None
+ A full list of unique labels that can be present among labels.
+
+ Returns
+ -------
+ response : dict
+ Contains keys: 'n_strings', 'total_positive', 'n_matches', 'precision', 'recall' and 'accuracy'.
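+ In the multiclass case (a label greater than 1 is present), 'total_positive', 'precision', 'recall' and 'accuracy' are lists with one entry per class, while 'n_strings' and 'n_matches' remain single values.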
+ """
+ max_label = max(labels)
+ if max_label > 1: # is multiclass
+ if full_labels_list is not None:
+ label_names = full_labels_list
+ else:
+ label_names = np.unique(labels)
+ scores = {
+ 'n_strings': [],
+ 'total_positive': [],
+ 'n_matches': [],
+ 'precision': [],
+ 'recall': [],
+ 'accuracy': []
+ }
+ for label_name in label_names:
+ one_vs_rest_labels = (np.array(labels) == label_name).astype('int32')
+ class_scores = self._binary_class_scores(strings, one_vs_rest_labels)
+ scores['n_strings'] = class_scores['n_strings']
+ scores['total_positive'].append(class_scores['total_positive'])
+ scores['n_matches'] = class_scores['n_matches']
+ scores['precision'].append(class_scores['precision'])
+ scores['recall'].append(class_scores['recall'])
+ scores['accuracy'].append(class_scores['accuracy'])
+ return scores
+ else:
+ return self._binary_class_scores(strings, labels)
+
def match(self, string):
"""Verify if the pattern matches the string (at any place)."""
return self.regex.search(string) is not None
- def filter(self, strings, labels=None):
- """Return strings and labels which match the pattern and which don't.
+ def filter(self, strings):
+ """Return a boolean mask for strings matching the pattern.
Parameters
----------
strings : list[str]
Strings to filter.
- labels : list[int], default None
- Labels of strings.
Returns
-------
- matched_strings
- Strings which match the pattern.
- labels_of_matched
- Labels of strings which match the pattern. If no labels provided, an empty list.
- not_matched_strings
- Strings which don't match the pattern.
- labels_of_not_matched
- Labels of strings which don't match the pattern. If no labels provided, an empty list.
+ matches
+ Numpy array of bool type.
"""
- matched_strings = []
- labels_of_matched = []
- not_matched_strings = []
- labels_of_not_matched = []
-
- for string_i in range(len(strings)):
- match = self.match(strings[string_i])
- if match:
- matched_strings.append(strings[string_i])
- if labels is not None:
- labels_of_matched.append(labels[string_i])
- else:
- not_matched_strings.append(strings[string_i])
- if labels is not None:
- labels_of_not_matched.append(labels[string_i])
+ matches = np.array(list(map(self.match, strings)))
- return matched_strings, labels_of_matched, not_matched_strings, labels_of_not_matched
+ return matches
class PatternNode:
@@ -157,8 +180,8 @@ def __init__(self, pattern: 'Pattern'):
self.right = None
self.left = None
self._pattern = pattern
- self._matches = None
- self._strings = None
+ self._matches = []
+ self._strings = []
self._labels = None
self._scores = None
@@ -174,6 +197,7 @@ def __repr__(self):
f'right={right_node}, ',
f'left={left_node}, ',
f'n_strings={len(self._strings)}, ',
+ f'n_matches={len(self._matches)}, ',
f'precision={self._scores["precision"]}, '
f'recall={self._scores["recall"]}'
')'
@@ -189,8 +213,6 @@ def get_strings(self):
def set_strings(self, strings):
"""Set PatternNode.strings attribute"""
- if self._strings is not None:
- raise ValueError("PatternNode.strings attribute is immutable once set")
self._strings = strings.copy()
def get_labels(self):
@@ -213,8 +235,6 @@ def get_matches(self):
def set_matches(self, matches):
"""Set PatternNode.matches attribute"""
- if self._matches is not None:
- raise ValueError("PatternNode.matches attribute is immutable once set")
self._matches = matches.copy()
def get_pattern(self):
@@ -293,8 +313,9 @@ def _generate_tokens(strings, labels, length):
"""
n = length
ngrams = {}
+ multiclass = max(labels) > 1
for string_i in range(len(strings)):
- if labels[string_i] == 0: # Don't add ngramgs for negative target.
+ if labels[string_i] == 0 and not multiclass: # Don't add ngramgs for negative target if it is binary classification
continue
string = strings[string_i]
for i in range(len(string) - n + 1):
@@ -385,15 +406,24 @@ def _augment_pattern(strings, labels, current_pattern: 'Pattern', tokens):
"""
candidates_scores = []
pattern_candidates = []
+ multiclass = max(labels) > 1
for token in tokens:
token_patterns = StringTree._combine_patterns(current_pattern, Pattern(token))
for pattern in token_patterns:
scores = pattern.scores(strings, labels)
- precision, recall, n_match = scores['precision'], scores['recall'], scores['n_matches']
- if precision + recall == 0:
- f1_score = 0
+ precision, recall = scores['precision'], scores['recall']
+ if multiclass:
+ max_precision = np.max(precision)
+ max_recall = np.max(recall)
+ if max_precision + max_recall == 0:
+ f1_score = 0
+ else:
+ f1_score = 2*max_precision*max_recall / (max_precision+max_recall)
else:
- f1_score = 2*precision*recall / (precision+recall)
+ if precision + recall == 0:
+ f1_score = 0
+ else:
+ f1_score = 2*precision*recall / (precision+recall)
candidates_scores.append(f1_score)
pattern_candidates.append(pattern)
if len(pattern_candidates) > 0:
@@ -435,6 +465,10 @@ def build(
raise ValueError('min_precision must not be < 0 or > 1')
if max_patterns is None:
max_patterns = np.inf
+
+ multiclass = max(labels) > 1
+ classes = np.unique(labels)
+ self._classes = classes
cur_strings = strings.copy()
cur_labels = labels.copy()
@@ -449,30 +483,38 @@ def build(
for (cur_strings, cur_labels) in evaluation_queue:
if verbose:
- print(f'\nStart processing another {len(cur_strings)} of strings with {sum(cur_labels)} positive labels.')
+ if not multiclass:
+ print(f'\nStart processing another {len(cur_strings)} of strings with {sum(cur_labels)} positive labels.')
+ else:
+ print(f'\nStart processing another {len(cur_strings)} of strings with {len(classes)} classes.')
cur_pattern = Pattern('')
- scores = cur_pattern.scores(cur_strings, cur_labels)
+ scores = cur_pattern.scores(cur_strings, cur_labels, full_labels_list=classes)
cur_node = PatternNode(cur_pattern)
cur_node.scores = scores
cur_node.strings = cur_strings
cur_node.labels = cur_labels
cur_node.matches = cur_strings
-
+
precision = scores['precision']
recall = scores['recall']
n_matches = scores['n_matches']
if verbose:
print(f'Current pattern="{cur_pattern}". N matches: {n_matches}, Precision={precision}, Recall={recall}')
+ if multiclass:
+ precision = np.max(scores['precision'])
+ recall = np.max(scores['recall'])
+ n_matches = np.sum(scores['n_matches'])
+
first_run = True
pattern_was_not_found = False
stop_processing = True
local_cur_strings, local_cur_labels = cur_strings, cur_labels
- while (precision < min_precision
- and sum(local_cur_labels) > 0
+ while cur_pattern.str == '' or (precision < min_precision
+ and (sum(local_cur_labels) > 0 or multiclass)
and n_matches > min_matches_leaf
and len(local_cur_strings) > min_strings_leaf):
stop_processing = False
@@ -489,15 +531,19 @@ def build(
if verbose:
print(f'Pattern was not found. Current pattern="{cur_pattern}". Precision={precision}, Recall={recall}')
break
- scores = best_pattern.scores(local_cur_strings, local_cur_labels)
+ scores = best_pattern.scores(local_cur_strings, local_cur_labels, full_labels_list=classes)
precision = scores['precision']
recall = scores['recall']
n_matches = scores['n_matches']
-
if verbose:
print(f'Best pattern="{best_pattern}". N matches: {n_matches}, Precision={precision}, Recall={recall}')
+ if multiclass:
+ precision = np.max(scores['precision'])
+ recall = np.max(scores['recall'])
+ n_matches = np.sum(scores['n_matches'])
+
if n_matches < min_matches_leaf:
stop_processing = True
if cur_pattern.str == '':
@@ -510,15 +556,20 @@ def build(
break
cur_pattern = best_pattern
-
- local_cur_strings, local_cur_labels, _, _ = \
- cur_pattern.filter(local_cur_strings, local_cur_labels)
+
+ cur_pattern_matches = cur_pattern.filter(local_cur_strings)
+ local_cur_strings = (np.array(local_cur_strings)[cur_pattern_matches]).tolist()
+ local_cur_labels = (np.array(local_cur_labels)[cur_pattern_matches]).tolist()
first_run = False
if pattern_was_not_found:
- cur_strings, cur_labels, not_matched_strings, labels_of_not_matched = \
- cur_pattern.filter(cur_strings, cur_labels)
+ cur_pattern_matches = cur_pattern.filter(cur_strings)
+ not_matched_strings = (np.array(cur_strings)[~cur_pattern_matches]).tolist()
+ labels_of_not_matched = (np.array(cur_labels)[~cur_pattern_matches]).tolist()
+ cur_strings = (np.array(cur_strings)[cur_pattern_matches]).tolist()
+ cur_labels = (np.array(cur_labels)[cur_pattern_matches]).tolist()
+
if sum(labels_of_not_matched) > 0:
evaluation_queue.append((not_matched_strings, labels_of_not_matched))
continue
@@ -528,8 +579,11 @@ def build(
cur_node.strings = cur_strings
cur_node.labels = cur_labels
- cur_strings, cur_labels, not_matched_strings, labels_of_not_matched = \
- cur_pattern.filter(cur_strings, cur_labels)
+ cur_pattern_matches = cur_pattern.filter(cur_strings)
+ not_matched_strings = (np.array(cur_strings)[~cur_pattern_matches]).tolist()
+ labels_of_not_matched = (np.array(cur_labels)[~cur_pattern_matches]).tolist()
+ cur_strings = (np.array(cur_strings)[cur_pattern_matches]).tolist()
+ cur_labels = (np.array(cur_labels)[cur_pattern_matches]).tolist()
cur_node.matches = cur_strings
@@ -543,7 +597,7 @@ def build(
print(f'Best pattern has {len(local_cur_strings)} strings which is less or equal to min_strings_leaf. Processing stopped.')
continue
- if sum(cur_labels) > 0:
+ if sum(cur_labels) > 0 or multiclass:
leaves.append(cur_node)
if verbose:
print('Last pattern was saved')
@@ -635,7 +689,8 @@ def match(self, strings, return_nodes=False):
matches : list[int]
List containing 1 (match) and 0 (no match) for each string.
matched_nodes : list[PatternNode]
- List consisting of PatternNodes of matching strings. Returned only if return_nodes is True.
+ List consisting of PatternNodes of matching strings. If no match is found, None is returned.
+ Returned only if return_nodes is True.
"""
if self._leaves is None:
raise ValueError("The StringTree was not built. Run StringTree.build method first.")
@@ -680,3 +735,34 @@ def recall_score(self, strings, labels):
else:
recall_score = 0
return recall_score
+
+ def predict_label(self, strings, return_nodes=False):
+ """Predict labels for given strings."""
+ if self._leaves is None:
+ raise ValueError("The StringTree was not built. Run StringTree.build method first.")
+
+ matches, matched_nodes = self.match(strings, return_nodes=True)
+ if len(self._classes) > 2:
+ get_label = lambda node: self._classes[np.argmax(node._scores['precision'])] if node is not None else None
+ else:
+ get_label = lambda node: self._classes[int(node._scores['precision'] > 0.5)] if node is not None else None
+ predicted_labels = list(map(get_label, matched_nodes))
+
+ if return_nodes:
+ return predicted_labels, matched_nodes
+ return predicted_labels
+
+
+ def get_nodes_by_label(self, label):
+ """Get nodes where the label is the most probable."""
+ if self._leaves is None:
+ raise ValueError("The StringTree was not built. Run StringTree.build method first.")
+
+ if len(self._classes) > 2:
+ get_label = lambda node: self._classes[np.argmax(node._scores['precision'])] if node is not None else None
+ else:
+ get_label = lambda node: self._classes[int(node._scores['precision'] > 0.5)] if node is not None else None
+
+ nodes_labels = np.array(list(map(get_label, self._leaves)))
+
+ return np.array(self._leaves)[nodes_labels == label]
diff --git a/tests/test_main.py b/tests/test_main.py
index 5502585..ba1939e 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -26,12 +26,35 @@ def test_tree_methods():
strings = ['Samsung X-500', 'Samsung SM-10', 'Samsung X-1100', 'Samsung F-10', 'Samsung X-2200',
'AB Nokia 1', 'DG Nokia 2', 'THGF Nokia 3', 'SFSD Nokia 4', 'Nokia XG', 'Nokia YO']
- target = [1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0]
+ labels = [1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0]
tree = strtree.StringTree()
- tree.build(strings, target, min_precision=0.75, min_token_length=1, verbose=True)
+ tree.build(strings, labels, min_precision=0.75, min_token_length=1, verbose=True)
assert tree.filter(['OO Nokia 12']) == ['OO Nokia 12']
assert tree.match(['OO Nokia 12']) == [1]
assert tree.precision_score(['OO Nokia 12'], [1]) == 1.0
- assert tree.recall_score(['OO Nokia 12'], [1]) == 1.0
\ No newline at end of file
+ assert tree.recall_score(['OO Nokia 12'], [1]) == 1.0
+ assert tree.predict_label(['OO Nokia 12']) == [1]
+ assert len(tree.get_nodes_by_label(1)) > 0
+
+
+def test_tree_methods_multiclass():
+
+ strings = ['Admiral', 'Apple', 'Age',
+ 'Bee', 'Bubble', 'Butter',
+ 'Color', 'Climate', 'CPU']
+
+ labels = [0, 0, 0,
+ 1, 1, 1,
+ 2, 2, 2]
+
+ tree = strtree.StringTree()
+ tree.build(strings, labels, min_precision=0.75, min_token_length=1, verbose=True)
+
+ assert tree.filter(['Ananas']) == ['Ananas']
+ assert tree.match(['Ananas']) == [1]
+ assert tree.precision_score(['Ananas'], [1]) == 1.0
+ assert tree.recall_score(['Ananas'], [1]) == 1.0
+ assert tree.predict_label(['Ananas']) == [0]
+ assert len(tree.get_nodes_by_label(1)) > 0
\ No newline at end of file