Commit

nbdev migration commit

amaiya committed Jun 15, 2024
1 parent b616989 commit f41cba2

Showing 70 changed files with 1,099 additions and 13,541 deletions.
16 changes: 0 additions & 16 deletions .gitconfig

This file was deleted.

14 changes: 14 additions & 0 deletions .github/workflows/deploy.yaml
@@ -0,0 +1,14 @@
name: Deploy to GitHub Pages

permissions:
  contents: write
  pages: write

on:
  push:
    branches: [ "main", "master" ]
  workflow_dispatch:
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps: [uses: fastai/workflows/quarto-ghp@master]
33 changes: 0 additions & 33 deletions .github/workflows/main.yml

This file was deleted.

7 changes: 7 additions & 0 deletions .github/workflows/test.yaml
@@ -0,0 +1,7 @@
name: CI
on: [workflow_dispatch, pull_request, push]

jobs:
  test:
    runs-on: ubuntu-latest
    steps: [uses: fastai/workflows/nbdev-ci@master]
3 changes: 3 additions & 0 deletions .gitignore
@@ -127,3 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/
_docs/
_proc/

37 changes: 0 additions & 37 deletions Makefile

This file was deleted.

450 changes: 450 additions & 0 deletions causalnlp/_modidx.py

Large diffs are not rendered by default.

55 changes: 26 additions & 29 deletions causalnlp/analyzers.py
@@ -1,9 +1,9 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/02_analyzers.ipynb (unless otherwise specified).
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_analyzers.ipynb.

__all__ = ['list2chunks', 'ZeroShotClassifier', 'TextEncoder', 'TopicModel', 'DEFAULT_TOKEN_PATTERN']

# Cell
# %% auto 0
__all__ = ['DEFAULT_TOKEN_PATTERN', 'list2chunks', 'ZeroShotClassifier', 'TextEncoder', 'TopicModel']

# %% ../nbs/02_analyzers.ipynb 4
import math
import warnings
import numpy as np
@@ -13,8 +13,7 @@ def list2chunks(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
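
# Example (illustrative note, not part of this commit): list2chunks yields n
# contiguous chunks whose sizes differ by at most one.
#
#   >>> list(list2chunks(list(range(7)), 3))
#   [[0, 1, 2], [3, 4], [5, 6]]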

# Cell

# %% ../nbs/02_analyzers.ipynb 5
class ZeroShotClassifier():
    """
    Interface to Zero Shot Topic Classifier
@@ -61,22 +60,22 @@ def predict(self, docs, labels=[], include_labels=False, multilabel=True,
                      if len(topic_strings) is large.
        - nli_template(str): labels are inserted into this template for use as hypotheses in natural language inference
        - topic_strings(list): alias for labels parameter for backwards compatibility
        **Returns:**
          inferred probabilities or list of inferred probabilities if doc is list
        """

        # error checks
        is_str_input = False
        if not isinstance(docs, (list, np.ndarray)):
            docs = [docs]
            is_str_input = True
        if not isinstance(docs[0], str): raise ValueError('docs must be string or a list of strings representing document(s)')
        if len(labels) > 0 and len(topic_strings) > 0: raise ValueError('labels and topic_strings are mutually exclusive')
        if not labels and not topic_strings: raise ValueError('labels must be a list of strings')
        if topic_strings:
            labels = topic_strings

@@ -117,8 +116,7 @@ def predict(self, docs, labels=[], include_labels=False, multilabel=True,
        if is_str_input: scores = scores[0]
        return scores
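
# Usage sketch (illustrative, not part of this commit; assumes the default
# zero-shot NLI checkpoint used by this class can be downloaded):
#
#   zsc = ZeroShotClassifier()
#   zsc.predict('I loved this movie.', labels=['positive', 'negative'],
#               include_labels=True)
#   # -> e.g., [('positive', 0.98), ('negative', 0.02)]  (made-up values)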

# Cell

# %% ../nbs/02_analyzers.ipynb 10
#from sentence_transformers import SentenceTransformer, util

class TextEncoder():
@@ -146,16 +144,15 @@ def __init__(self, model_name='stsb-roberta-large', device=None):
        self.torch_device = device
        if self.torch_device is None: self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = SentenceTransformer(model_name)

    def encode(self, texts, batch_size=32, normalize=False, show_progress_bar=False):
        """Generate embedding for supplied text"""
        if isinstance(texts, str): texts = [texts]
        return self.model.encode(texts, batch_size=batch_size,
                                 show_progress_bar=show_progress_bar, normalize_embeddings=normalize,
                                 convert_to_tensor=False, device=self.torch_device)
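
# Usage sketch (illustrative, not part of this commit; assumes the
# sentence-transformers package and 'stsb-roberta-large' weights are available):
#
#   enc = TextEncoder()
#   vecs = enc.encode(['The cat sat on the mat.', 'Dogs chase cats.'], normalize=True)
#   vecs.shape  # -> (2, 1024) with this model's 1024-dim embeddings (illustrative)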

# Cell

# %% ../nbs/02_analyzers.ipynb 16
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import math
@@ -166,7 +163,7 @@ def encode(self, texts, batch_size=32, normalize=False, show_progress_bar=False)
class TopicModel():


    def __init__(self,texts=None, n_topics=None, n_features=10000,
                 min_df=5, max_df=0.5, stop_words='english',
                 model_type='lda',
                 lda_max_iter=5, lda_mode='online',
@@ -176,7 +173,7 @@ def __init__(self,texts=None, n_topics=None, n_features=10000,
"""
Fits a topic model to documents in <texts>.
Example:
tm = ktrain.text.get_topic_model(docs, n_topics=20,
tm = ktrain.text.get_topic_model(docs, n_topics=20,
n_features=1000, min_df=2, max_df=0.95)
Args:
texts (list of str): list of texts
@@ -191,7 +188,7 @@ def __init__(self,texts=None, n_topics=None, n_features=10000,
                               If lda_mode='batch', this should be increased (e.g., 1500).
                               Ignored if model_type != 'lda'
            lda_mode (str): one of {'online', 'batch'}. Ignored if model_type != 'lda'
            token_pattern(str): regex pattern to use to tokenize documents.
            verbose(bool): verbosity
        """
        self.verbose = verbose
@@ -208,7 +205,7 @@ def __init__(self,texts=None, n_topics=None, n_features=10000,
        if texts is not None:
            (model, vectorizer) = self.train(texts, model_type=model_type,
                                             n_topics=n_topics, n_features=n_features,
                                             min_df=min_df, max_df=max_df,
                                             stop_words=stop_words,
                                             lda_max_iter=lda_max_iter, lda_mode=lda_mode,
                                             token_pattern=token_pattern,
@@ -258,7 +255,7 @@ def train(self,texts, model_type='lda', n_topics=None, n_features=10000,
        vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                     max_features=n_features, stop_words=stop_words,
                                     token_pattern=token_pattern, ngram_range=ngram_range)

        x_train = vectorizer.fit_transform(texts)

@@ -321,7 +318,7 @@ def get_word_weights(self, topic_id, n_words=100):
        Returns a list of tuples of the form (word, weight) for the given topic_id.
        """
        self._check_model()
        if topic_id+1 > len(self.model.components_):
            raise ValueError('topic_id must be less than %s' % (len(self.model.components_)))
        feature_names = self.vectorizer.get_feature_names()
        word_probs = self.model.components_[topic_id]
@@ -352,7 +349,7 @@ def print_topics(self, n_words=10, show_counts=False):
        topics = self.get_topics(n_words=n_words, as_string=True)
        if show_counts:
            self._check_build()
            topic_counts = sorted([ (k, topics[k], len(v)) for k,v in self.topic_dict.items()],
                                  key=lambda kv:kv[-1], reverse=True)
            for (idx, topic, count) in topic_counts:
                print("topic:%s | count:%s | %s" %(idx, count, topic))
@@ -371,20 +368,20 @@ def build(self, texts):

        self.topic_dict = self._rank_documents(texts, doc_topics=doc_topics)
        return

    def get_docs(self, topic_ids=[], doc_ids=[], rank=False):
        """
        Returns document entries for supplied topic_ids.
        """
        self._check_build()
        if not topic_ids:
            topic_ids = list(range(self.n_topics))
        result_texts = []
        for topic_id in topic_ids:
            if topic_id not in self.topic_dict: continue
            texts = [{'text':tup[0], 'doc_id':tup[1], 'topic_proba':tup[2], 'topic_id':topic_id} for tup in self.topic_dict[topic_id]
                     if not doc_ids or tup[1] in doc_ids]
            result_texts.extend(texts)
        if not rank:
@@ -439,10 +436,10 @@ def _rank_documents(self,
            result_dict[topic_id] = lst
        return result_dict


    def _check_build(self):
        self._check_model()
        if self.topic_dict is None:
            raise Exception('Must call build() method.')

