Skip to content

Commit

Permalink
merge conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
igorbrigadir committed Mar 31, 2021
2 parents ab20107 + f76fc3c commit 7357570
Show file tree
Hide file tree
Showing 7 changed files with 250 additions and 66 deletions.
30 changes: 28 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ The default model is small English spaCy model (en_core_web_sm, 11Mb) and is ins

Example: installing medium (91 Mb) English model (for more models see [spaCy documentation](https://spacy.io/usage/models)).
```bash
pip install spacy==2.1.3
pip install transformers==2.2.2
pip install spacy
pip install transformers # > 2.2.0
pip install neuralcoref

python -m spacy download en_core_web_md
Expand Down Expand Up @@ -152,6 +152,32 @@ Still the building is among the best known in the city, even to people who have
"""
```


### Calculating Elbow

As of bert-extractive-summarizer version 0.7.1, you can also use the elbow method to determine the optimal number of
clusters. The example below shows how to retrieve the list of inertias.

```python
from summarizer import Summarizer

body = 'Your Text here.'
model = Summarizer()
res = model.calculate_elbow(body, k_max=10)
print(res)
```

You can also use the elbow method to find the optimal number of sentences, as shown below.

```python
from summarizer import Summarizer

body = 'Your Text here.'
model = Summarizer()
res = model.calculate_optimal_k(body, k_max=10)
print(res)
```

## Summarizer Options

```
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
from setuptools import find_packages

setup(name='bert-extractive-summarizer',
version='0.6.1.dev3',
version='0.7.1.dev3',
description='Extractive Text Summarization with BERT',
keywords = ['bert', 'pytorch', 'machine learning', 'deep learning', 'extractive summarization', 'summary'],
long_description=open("README.md", "r", encoding='utf-8').read(),
long_description_content_type="text/markdown",
url='https://github.com/dmmiller612/bert-extractive-summarizer',
download_url='https://github.com/dmmiller612/bert-extractive-summarizer/archive/0.6.1.tar.gz',
download_url='https://github.com/dmmiller612/bert-extractive-summarizer/archive/0.7.1.tar.gz',
author='Derek Miller',
author_email='[email protected]',
install_requires=['transformers', 'scikit-learn', 'spacy'],
Expand Down
33 changes: 20 additions & 13 deletions summarizer/bert_parent.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,10 @@ def __init__(
custom_tokenizer: PreTrainedTokenizer=None
):
"""
:param model: Model is the string path for the bert weights. If given a keyword, the s3 path will be used
:param custom_model: This is optional if a custom bert model is used
:param custom_tokenizer: Place to use custom tokenizer
:param model: Model is the string path for the bert weights. If given a keyword, the s3 path will be used.
:param custom_model: This is optional if a custom bert model is used.
:param custom_tokenizer: Place to use custom tokenizer.
"""

base_model, base_tokenizer = self.MODELS.get(model, (None, None))

self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Expand All @@ -54,8 +53,8 @@ def tokenize_input(self, text: str) -> torch.tensor:
"""
Tokenizes the text input.
:param text: Text to tokenize
:return: Returns a torch tensor
:param text: Text to tokenize.
:return: Returns a torch tensor.
"""
tokenized_text = self.tokenizer.tokenize(text)
indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
Expand Down Expand Up @@ -87,16 +86,15 @@ def extract_embeddings(
) -> torch.Tensor:

"""
Extracts the embeddings for the given text
Extracts the embeddings for the given text.
:param text: The text to extract embeddings for.
:param hidden: The hidden layer(s) to use for a readout handler
:param squeeze: If we should squeeze the outputs (required for some layers)
:param hidden: The hidden layer(s) to use for a readout handler.
:param squeeze: If we should squeeze the outputs (required for some layers).
:param reduce_option: How we should reduce the items.
:param hidden_concat: Whether or not to concat multiple hidden layers.
:return: A torch vector.
"""

tokens_tensor = self.tokenize_input(text)
pooled, hidden_states = self.model(tokens_tensor)[-2:]

Expand Down Expand Up @@ -132,10 +130,10 @@ def create_matrix(
hidden_concat: bool = False
) -> ndarray:
"""
Create matrix from the embeddings
Create matrix from the embeddings.
:param content: The list of sentences
:param hidden: Which hidden layer to use
:param content: The list of sentences.
:param hidden: Which hidden layer to use.
:param reduce_option: The reduce option to run.
:param hidden_concat: Whether or not to concat multiple hidden layers.
:return: A numpy array matrix of the given content.
Expand All @@ -154,4 +152,13 @@ def __call__(
reduce_option: str = 'mean',
hidden_concat: bool = False
) -> ndarray:
"""
Create matrix from the embeddings.
:param content: The list of sentences.
:param hidden: Which hidden layer to use.
:param reduce_option: The reduce option to run.
:param hidden_concat: Whether or not to concat multiple hidden layers.
:return: A numpy array matrix of the given content.
"""
return self.create_matrix(content, hidden, reduce_option, hidden_concat)
92 changes: 68 additions & 24 deletions summarizer/cluster_features.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Dict
from typing import List, Dict, Tuple

import numpy as np
from numpy import ndarray
Expand All @@ -20,12 +20,11 @@ def __init__(
random_state: int = 12345
):
"""
:param features: the embedding matrix created by bert parent
:param algorithm: Which clustering algorithm to use
:param pca_k: If you want the features to be ran through pca, this is the components number
:param random_state: Random state
:param features: the embedding matrix created by bert parent.
:param algorithm: Which clustering algorithm to use.
:param pca_k: If you want the features to be ran through pca, this is the components number.
:param random_state: Random state.
"""

if pca_k:
self.features = PCA(n_components=pca_k).fit_transform(features)
else:
Expand All @@ -37,10 +36,10 @@ def __init__(

def __get_model(self, k: int):
"""
Retrieve clustering model
Retrieve clustering model.
:param k: amount of clusters
:return: Clustering model
:param k: amount of clusters.
:return: Clustering model.
"""

if self.algorithm == 'gmm':
Expand All @@ -49,22 +48,22 @@ def __get_model(self, k: int):

def __get_centroids(self, model):
"""
Retrieve centroids of model
:param model: Clustering model
:return: Centroids
"""
Retrieve centroids of model.
:param model: Clustering model.
:return: Centroids.
"""
if self.algorithm == 'gmm':
return model.means_
return model.cluster_centers_

def __find_closest_args(self, centroids: np.ndarray) -> Dict:
"""
Find the closest arguments to centroid
:param centroids: Centroids to find closest
:return: Closest arguments
"""
Find the closest arguments to centroid.
:param centroids: Centroids to find closest.
:return: Closest arguments.
"""
centroid_min = 1e10
cur_arg = -1
args = {}
Expand All @@ -86,12 +85,57 @@ def __find_closest_args(self, centroids: np.ndarray) -> Dict:

return args

def calculate_elbow(self, k_max: int) -> List[float]:
    """
    Collects the inertia of a freshly fitted clustering model for each candidate k.

    Candidate values start at 1 and stop before min(k_max, number of feature rows),
    i.e. the upper bound is exclusive.

    :param k_max: Exclusive upper bound on the number of clusters to try.
    :return: One inertia value per candidate k, in increasing order of k.
    """
    # NOTE(review): this reads `inertia_` from the fitted model; presumably the
    # model returned for the 'gmm' algorithm does not expose it - confirm.
    upper_bound = min(k_max, len(self.features))

    return [
        self.__get_model(n_clusters).fit(self.features).inertia_
        for n_clusters in range(1, upper_bound)
    ]

def calculate_optimal_cluster(self, k_max: int):
    """
    Picks a cluster count from the elbow of the inertia curve.

    The inertias from calculate_elbow are differenced once and twice; each interior
    position is scored as (second difference - first difference) taken one step
    ahead, and the position with the largest positive score wins.

    :param k_max: The max k passed through to calculate_elbow.
    :return: The selected cluster size; 1 when no position scores above zero.
    """
    inertias = self.calculate_elbow(k_max)
    count = len(inertias)

    # Entries without enough predecessors keep their 0.0 placeholder.
    first_diff = [0.0] * count
    second_diff = [0.0] * count

    for idx in range(1, count):
        first_diff[idx] = inertias[idx] - inertias[idx - 1]

    for idx in range(2, count):
        second_diff[idx] = first_diff[idx] - first_diff[idx - 1]

    best_k = 1
    best_strength = 0

    # Positions 0, 1 and the last one always score 0 and can never beat the
    # running maximum, so only the interior positions are visited.
    for pos in range(2, count - 1):
        strength = second_diff[pos + 1] - first_diff[pos + 1]

        if strength > best_strength:
            best_strength = strength
            best_k = pos + 1

    return best_k

def cluster(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
"""
Clusters sentences based on the ratio
:param ratio: Ratio to use for clustering
Clusters sentences based on the ratio.
:param ratio: Ratio to use for clustering.
:param num_sentences: Number of sentences. Overrides ratio.
:return: Sentences index that qualify for summary
:return: Sentences index that qualify for summary.
"""

if num_sentences is not None:
Expand All @@ -112,10 +156,10 @@ def cluster(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:

def __call__(self, ratio: float = 0.1, num_sentences: int = None) -> List[int]:
    """
    Clusters sentences based on the ratio.

    Convenience entry point that delegates to cluster() with the same arguments.

    :param ratio: Ratio to use for clustering.
    :param num_sentences: Number of sentences. Overrides ratio.
    :return: Sentences index that qualify for summary.
    """
    # Forward num_sentences too: it was previously accepted here but never
    # passed on, so callers using the instance as a callable always got the
    # ratio-based result.
    return self.cluster(ratio, num_sentences)
Loading

0 comments on commit 7357570

Please sign in to comment.