merge conflicts

recsyslabs · Mar 31, 2021 · 7357570 · 7357570
2 parents ab20107 + f76fc3c
commit 7357570
Show file tree

Hide file tree

Showing 7 changed files with 250 additions and 66 deletions.
diff --git a/README.md b/README.md
@@ -28,8 +28,8 @@ The default model is small English spaCy model (en_core_web_sm, 11Mb) and is ins
 
 Example: installing medium (91 Mb) English model (for more models see [spaCy documentation](https://spacy.io/usage/models)). 
 ```bash
-pip install spacy==2.1.3
-pip install transformers==2.2.2
+pip install spacy
+pip install transformers # > 2.2.0
 pip install neuralcoref
 
 python -m spacy download en_core_web_md
@@ -152,6 +152,32 @@ Still the building is among the best known in the city, even to people who have
 """
 ```
 
+
+### Calculating Elbow
+
+As of bert-extractive-summarizer version 0.7.1, you can also use the elbow method to determine the optimal number of clusters.
+The example below shows how to retrieve the list of inertias.
+
+```python
+from summarizer import Summarizer
+
+body = 'Your Text here.'
+model = Summarizer()
+res = model.calculate_elbow(body, k_max=10)
+print(res)
+```
+
+You can also use the elbow method to find the optimal number of sentences, as shown in the following example.
+
+```python
+from summarizer import Summarizer
+
+body = 'Your Text here.'
+model = Summarizer()
+res = model.calculate_optimal_k(body, k_max=10)
+print(res)
+```
+
 ## Summarizer Options
 
 ```

diff --git a/setup.py b/setup.py
@@ -2,13 +2,13 @@
 from setuptools import find_packages
 
 setup(name='bert-extractive-summarizer',
-      version='0.6.1.dev3',
+      version='0.7.1.dev3',
       description='Extractive Text Summarization with BERT',
       keywords = ['bert', 'pytorch', 'machine learning', 'deep learning', 'extractive summarization', 'summary'],
       long_description=open("README.md", "r", encoding='utf-8').read(),
       long_description_content_type="text/markdown",
       url='https://github.com/dmmiller612/bert-extractive-summarizer',
-      download_url='https://github.com/dmmiller612/bert-extractive-summarizer/archive/0.6.1.tar.gz',
+      download_url='https://github.com/dmmiller612/bert-extractive-summarizer/archive/0.7.1.tar.gz',
       author='Derek Miller',
       author_email='[email protected]',
       install_requires=['transformers', 'scikit-learn', 'spacy'],

diff --git a/summarizer/bert_parent.py b/summarizer/bert_parent.py
@@ -29,11 +29,10 @@ def __init__(
         custom_tokenizer: PreTrainedTokenizer=None
     ):
         """
-        :param model: Model is the string path for the bert weights. If given a keyword, the s3 path will be used
-        :param custom_model: This is optional if a custom bert model is used
-        :param custom_tokenizer: Place to use custom tokenizer
+        :param model: Model is the string path for the bert weights. If given a keyword, the s3 path will be used.
+        :param custom_model: This is optional if a custom bert model is used.
+        :param custom_tokenizer: Place to use custom tokenizer.
         """
-
         base_model, base_tokenizer = self.MODELS.get(model, (None, None))
 
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -54,8 +53,8 @@ def tokenize_input(self, text: str) -> torch.tensor:
         """
         Tokenizes the text input.
 
-        :param text: Text to tokenize
-        :return: Returns a torch tensor
+        :param text: Text to tokenize.
+        :return: Returns a torch tensor.
         """
         tokenized_text = self.tokenizer.tokenize(text)
         indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
@@ -87,16 +86,15 @@ def extract_embeddings(
     ) -> torch.Tensor:
 
         """
-        Extracts the embeddings for the given text
+        Extracts the embeddings for the given text.
 
         :param text: The text to extract embeddings for.
-        :param hidden: The hidden layer(s) to use for a readout handler
-        :param squeeze: If we should squeeze the outputs (required for some layers)
+        :param hidden: The hidden layer(s) to use for a readout handler.
+        :param squeeze: If we should squeeze the outputs (required for some layers).
         :param reduce_option: How we should reduce the items.
         :param hidden_concat: Whether or not to concat multiple hidden layers.
         :return: A torch vector.
         """
-
         tokens_tensor = self.tokenize_input(text)
         pooled, hidden_states = self.model(tokens_tensor)[-2:]
 
@@ -132,10 +130,10 @@ def create_matrix(
         hidden_concat: bool = False
     ) -> ndarray:
         """
-        Create matrix from the embeddings
+        Create matrix from the embeddings.
 
-        :param content: The list of sentences
-        :param hidden: Which hidden layer to use
+        :param content: The list of sentences.
+        :param hidden: Which hidden layer to use.
         :param reduce_option: The reduce option to run.
         :param hidden_concat: Whether or not to concat multiple hidden layers.
         :return: A numpy array matrix of the given content.
@@ -154,4 +152,13 @@ def __call__(
         reduce_option: str = 'mean',
         hidden_concat: bool = False
     ) -> ndarray:
+        """
+        Create matrix from the embeddings.
+
+        :param content: The list of sentences.
+        :param hidden: Which hidden layer to use.
+        :param reduce_option: The reduce option to run.
+        :param hidden_concat: Whether or not to concat multiple hidden layers.
+        :return: A numpy array matrix of the given content.
+        """
         return self.create_matrix(content, hidden, reduce_option, hidden_concat)
diff --git a/summarizer/cluster_features.py b/summarizer/cluster_features.py
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Dict, Tuple
 
 import numpy as np
 from numpy import ndarray
@@ -20,12 +20,11 @@ def __init__(
         random_state: int = 12345
     ):
         """
-        :param features: the embedding matrix created by bert parent
-        :param algorithm: Which clustering algorithm to use
-        :param pca_k: If you want the features to be ran through pca, this is the components number
-        :param random_state: Random state
+        :param features: the embedding matrix created by bert parent.
+        :param algorithm: Which clustering algorithm to use.
+        :param pca_k: If you want the features to be run through PCA, this is the number of components.
+        :param random_state: Random state.
         """
-
         if pca_k:
             self.features = PCA(n_components=pca_k).fit_transform(features)
         else:
@@ -37,10 +36,10 @@ def __init__(
 
     def __get_model(self, k: int):
         """
-        Retrieve clustering model
+        Retrieve clustering model.
 
-        :param k: amount of clusters
-        :return: Clustering model
+        :param k: amount of clusters.
+        :return: Clustering model.
         """
 
         if self.algorithm == 'gmm':
@@ -49,22 +48,22 @@ def __get_model(self, k: int):
 
     def __get_centroids(self, model):
         """
-        Retrieve centroids of model
-        :param model: Clustering model
-        :return: Centroids
-        """
+        Retrieve centroids of model.
 
+        :param model: Clustering model.
+        :return: Centroids.
+        """
         if self.algorithm == 'gmm':
             return model.means_
         return model.cluster_centers_
 
     def __find_closest_args(self, centroids: np.ndarray) -> Dict:
         """
-        Find the closest arguments to centroid
-        :param centroids: Centroids to find closest
-        :return: Closest arguments
-        """
+        Find the closest arguments to centroid.
 
+        :param centroids: Centroids to find closest.
+        :return: Closest arguments.
+        """
         centroid_min = 1e10
         cur_arg = -1
         args = {}
@@ -86,12 +85,57 @@ def __find_closest_args(self, centroids: np.ndarray) -> Dict:
 
         return args
 
+    def calculate_elbow(self, k_max: int) -> List[float]:
+        """
+        Calculates elbow up to the provided k_max.
+
+        :param k_max: K_max to calculate elbow for.
+        :return: The inertias up to k_max.
+        """
+        inertias = []
+
+        for k in range(1, min(k_max, len(self.features))):
+            model = self.__get_model(k).fit(self.features)
+
+            inertias.append(model.inertia_)
+
+        return inertias
+
+    def calculate_optimal_cluster(self, k_max: int):
+        """
+        Calculates the optimal cluster based on Elbow.
+
+        :param k_max: The max k to search elbow for.
+        :return: The optimal cluster size.
+        """
+        delta_1 = []
+        delta_2 = []
+
+        max_strength = 0
+        k = 1
+
+        inertias = self.calculate_elbow(k_max)
+
+        for i in range(len(inertias)):
+            delta_1.append(inertias[i] - inertias[i-1] if i > 0 else 0.0)
+            delta_2.append(delta_1[i] - delta_1[i-1] if i > 1 else 0.0)
+
+        for j in range(len(inertias)):
+            strength = 0 if j <= 1 or j == len(inertias) -1 else delta_2[j+1] - delta_1[j+1]
+
+            if strength > max_strength:
+                max_strength = strength
+                k = j + 1
+
+        return k
+
     def cluster(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
         """
-        Clusters sentences based on the ratio
-        :param ratio: Ratio to use for clustering
+        Clusters sentences based on the ratio.
+
+        :param ratio: Ratio to use for clustering.
         :param num_sentences: Number of sentences. Overrides ratio.
-        :return: Sentences index that qualify for summary
+        :return: Sentences index that qualify for summary.
         """
 
         if num_sentences is not None:
@@ -112,10 +156,10 @@ def cluster(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
 
     def __call__(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
         """
-        Clusters sentences based on the ratio
-        :param ratio: Ratio to use for clustering
+        Clusters sentences based on the ratio.
+
+        :param ratio: Ratio to use for clustering.
         :param num_sentences: Number of sentences. Overrides ratio.
-        :return: Sentences index that qualify for summary
+        :return: Sentences index that qualify for summary.
         """
-
         return self.cluster(ratio)