Commit

nbdev migration commit

amaiya committed Jun 15, 2024
1 parent b616989 commit f41cba2

Showing 70 changed files with 1,099 additions and 13,541 deletions.
16 changes: 0 additions & 16 deletions .gitconfig

This file was deleted.

14 changes: 14 additions & 0 deletions .github/workflows/deploy.yaml
@@ -0,0 +1,14 @@
name: Deploy to GitHub Pages

permissions:
  contents: write
  pages: write

on:
  push:
    branches: [ "main", "master" ]
  workflow_dispatch:
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps: [uses: fastai/workflows/quarto-ghp@master]
33 changes: 0 additions & 33 deletions .github/workflows/main.yml

This file was deleted.

7 changes: 7 additions & 0 deletions .github/workflows/test.yaml
@@ -0,0 +1,7 @@
name: CI
on: [workflow_dispatch, pull_request, push]

jobs:
  test:
    runs-on: ubuntu-latest
    steps: [uses: fastai/workflows/nbdev-ci@master]
3 changes: 3 additions & 0 deletions .gitignore
@@ -127,3 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/
_docs/
_proc/

37 changes: 0 additions & 37 deletions Makefile

This file was deleted.

450 changes: 450 additions & 0 deletions causalnlp/_modidx.py

Large diffs are not rendered by default.

55 changes: 26 additions & 29 deletions causalnlp/analyzers.py
@@ -1,9 +1,9 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/02_analyzers.ipynb (unless otherwise specified).
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_analyzers.ipynb.

__all__ = ['list2chunks', 'ZeroShotClassifier', 'TextEncoder', 'TopicModel', 'DEFAULT_TOKEN_PATTERN']

# Cell
# %% auto 0
__all__ = ['DEFAULT_TOKEN_PATTERN', 'list2chunks', 'ZeroShotClassifier', 'TextEncoder', 'TopicModel']

# %% ../nbs/02_analyzers.ipynb 4
import math
import warnings
import numpy as np
@@ -13,8 +13,7 @@ def list2chunks(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
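
# Example (illustrative note, not part of this commit): list2chunks yields n
# contiguous chunks whose sizes differ by at most one.
#
#   >>> list(list2chunks(list(range(7)), 3))
#   [[0, 1, 2], [3, 4], [5, 6]]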

# Cell

# %% ../nbs/02_analyzers.ipynb 5
class ZeroShotClassifier():
    """
    Interface to Zero Shot Topic Classifier
@@ -61,22 +60,22 @@ def predict(self, docs, labels=[], include_labels=False, multilabel=True,
                      if len(topic_strings) is large.
        - nli_template(str): labels are inserted into this template for use as hypotheses in natural language inference
        - topic_strings(list): alias for labels parameter for backwards compatibility
        **Returns:**
          inferred probabilities or list of inferred probabilities if doc is list
        """

        # error checks
        is_str_input = False
        if not isinstance(docs, (list, np.ndarray)):
            docs = [docs]
            is_str_input = True
        if not isinstance(docs[0], str): raise ValueError('docs must be string or a list of strings representing document(s)')
        if len(labels) > 0 and len(topic_strings) > 0: raise ValueError('labels and topic_strings are mutually exclusive')
        if not labels and not topic_strings: raise ValueError('labels must be a list of strings')
        if topic_strings:
            labels = topic_strings

@@ -117,8 +116,7 @@ def predict(self, docs, labels=[], include_labels=False, multilabel=True,
        if is_str_input: scores = scores[0]
        return scores
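
# Usage sketch (illustrative, not part of this commit; assumes the default
# zero-shot NLI checkpoint used by this class can be downloaded):
#
#   zsc = ZeroShotClassifier()
#   zsc.predict('I loved this movie.', labels=['positive', 'negative'],
#               include_labels=True)
#   # -> e.g., [('positive', 0.98), ('negative', 0.02)]  (made-up values)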

# Cell

# %% ../nbs/02_analyzers.ipynb 10
#from sentence_transformers import SentenceTransformer, util

class TextEncoder():
@@ -146,16 +144,15 @@ def __init__(self, model_name='stsb-roberta-large', device=None):
        self.torch_device = device
        if self.torch_device is None: self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = SentenceTransformer(model_name)

    def encode(self, texts, batch_size=32, normalize=False, show_progress_bar=False):
        """Generate embedding for supplied text"""
        if isinstance(texts, str): texts = [texts]
        return self.model.encode(texts, batch_size=batch_size,
                                 show_progress_bar=show_progress_bar, normalize_embeddings=normalize,
                                 convert_to_tensor=False, device=self.torch_device)
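
# Usage sketch (illustrative, not part of this commit; assumes the
# sentence-transformers package and 'stsb-roberta-large' weights are available):
#
#   enc = TextEncoder()
#   vecs = enc.encode(['The cat sat on the mat.', 'Dogs chase cats.'], normalize=True)
#   vecs.shape  # -> (2, 1024) with this model's 1024-dim embeddings (illustrative)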

# Cell

# %% ../nbs/02_analyzers.ipynb 16
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import math
@@ -166,7 +163,7 @@ def encode(self, texts, batch_size=32, normalize=False, show_progress_bar=False)
class TopicModel():


    def __init__(self,texts=None, n_topics=None, n_features=10000,
                 min_df=5, max_df=0.5, stop_words='english',
                 model_type='lda',
                 lda_max_iter=5, lda_mode='online',
@@ -176,7 +173,7 @@ def __init__(self,texts=None, n_topics=None, n_features=10000,
"""
Fits a topic model to documents in <texts>.
Example:
tm = ktrain.text.get_topic_model(docs, n_topics=20,
tm = ktrain.text.get_topic_model(docs, n_topics=20,
n_features=1000, min_df=2, max_df=0.95)
Args:
texts (list of str): list of texts
@@ -191,7 +188,7 @@ def __init__(self,texts=None, n_topics=None, n_features=10000,
                               If lda_mode='batch', this should be increased (e.g., 1500).
                               Ignored if model_type != 'lda'
            lda_mode (str): one of {'online', 'batch'}. Ignored if model_type != 'lda'
            token_pattern(str): regex pattern to use to tokenize documents.
            verbose(bool): verbosity
        """
        self.verbose = verbose
@@ -208,7 +205,7 @@ def __init__(self,texts=None, n_topics=None, n_features=10000,
        if texts is not None:
            (model, vectorizer) = self.train(texts, model_type=model_type,
                                             n_topics=n_topics, n_features=n_features,
                                             min_df=min_df, max_df=max_df,
                                             stop_words=stop_words,
                                             lda_max_iter=lda_max_iter, lda_mode=lda_mode,
                                             token_pattern=token_pattern,
@@ -258,7 +255,7 @@ def train(self,texts, model_type='lda', n_topics=None, n_features=10000,
        vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                     max_features=n_features, stop_words=stop_words,
                                     token_pattern=token_pattern, ngram_range=ngram_range)

        x_train = vectorizer.fit_transform(texts)

@@ -321,7 +318,7 @@ def get_word_weights(self, topic_id, n_words=100):
        Returns a list of tuples of the form (word, weight) for the given topic_id.
        """
        self._check_model()
        if topic_id+1 > len(self.model.components_):
            raise ValueError('topic_id must be less than %s' % (len(self.model.components_)))
        feature_names = self.vectorizer.get_feature_names()
        word_probs = self.model.components_[topic_id]
@@ -352,7 +349,7 @@ def print_topics(self, n_words=10, show_counts=False):
        topics = self.get_topics(n_words=n_words, as_string=True)
        if show_counts:
            self._check_build()
            topic_counts = sorted([ (k, topics[k], len(v)) for k,v in self.topic_dict.items()],
                                  key=lambda kv:kv[-1], reverse=True)
            for (idx, topic, count) in topic_counts:
                print("topic:%s | count:%s | %s" %(idx, count, topic))
@@ -371,20 +368,20 @@ def build(self, texts):

        self.topic_dict = self._rank_documents(texts, doc_topics=doc_topics)
        return

    def get_docs(self, topic_ids=[], doc_ids=[], rank=False):
        """
        Returns document entries for supplied topic_ids.
        """
        self._check_build()
        if not topic_ids:
            topic_ids = list(range(self.n_topics))
        result_texts = []
        for topic_id in topic_ids:
            if topic_id not in self.topic_dict: continue
            texts = [{'text':tup[0], 'doc_id':tup[1], 'topic_proba':tup[2], 'topic_id':topic_id} for tup in self.topic_dict[topic_id]
                     if not doc_ids or tup[1] in doc_ids]
            result_texts.extend(texts)
        if not rank:
@@ -439,10 +436,10 @@ def _rank_documents(self,
            result_dict[topic_id] = lst
        return result_dict


    def _check_build(self):
        self._check_model()
        if self.topic_dict is None:
            raise Exception('Must call build() method.')

