You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I'm training a Top2Vec model using the Universal Sentence Encoder, and I'm getting `TypeError: 'numpy.float64' object cannot be interpreted as an integer`.
All of my input text is of string type (I've quadruple-checked), and this has worked before, so I'm at a loss as to what is causing this error or where it originates. Is it coming from hdbscan?
TypeError Traceback (most recent call last)
Cell In[12], line 2
1 # Train the top2vec model
----> 2 model = Top2Vec(df.Q10.values, embedding_model='universal-sentence-encoder') #, embedding_model='universal-sentence-encoder'
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/top2vec/Top2Vec.py:666, in Top2Vec.__init__(self, documents, min_count, topic_merge_delta, ngram_vocab, ngram_vocab_args, embedding_model, embedding_model_path, embedding_batch_size, split_documents, document_chunker, chunk_length, max_num_chunks, chunk_overlap_ratio, chunk_len_coverage_ratio, sentencizer, speed, use_corpus_file, document_ids, keep_documents, workers, tokenizer, use_embedding_model_tokenizer, umap_args, hdbscan_args, verbose)
663 else:
664 raise ValueError(f"{embedding_model} is an invalid embedding model.")
--> 666 self.compute_topics(umap_args=umap_args, hdbscan_args=hdbscan_args, topic_merge_delta=topic_merge_delta)
668 # initialize document indexing variables
669 self.document_index = None
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/top2vec/Top2Vec.py:1266, in Top2Vec.compute_topics(self, umap_args, hdbscan_args, topic_merge_delta)
1261 if hdbscan_args is None:
1262 hdbscan_args = {'min_cluster_size': 15,
1263 'metric': 'euclidean',
1264 'cluster_selection_method': 'eom'}
-> 1266 cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
1268 # calculate topic vectors from dense areas of documents
1269 logger.info('Finding topics')
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/hdbscan/hdbscan_.py:1205, in HDBSCAN.fit(self, X, y)
1195 kwargs.pop("prediction_data", None)
1196 kwargs.update(self._metric_kwargs)
1198 (
1199 self.labels_,
1200 self.probabilities_,
1201 self.cluster_persistence_,
1202 self._condensed_tree,
1203 self._single_linkage_tree,
1204 self._min_spanning_tree,
-> 1205 ) = hdbscan(clean_data, **kwargs)
1207 if self.metric != "precomputed" and not self._all_finite:
1208 # remap indices to align with original data in the case of non-finite entries.
1209 self._condensed_tree = remap_condensed_tree(
1210 self._condensed_tree, internal_to_raw, outliers
1211 )
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/hdbscan/hdbscan_.py:884, in hdbscan(X, min_cluster_size, min_samples, alpha, cluster_selection_epsilon, max_cluster_size, metric, p, leaf_size, algorithm, memory, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, cluster_selection_method, allow_single_cluster, match_reference_implementation, **kwargs)
867 else:
868 (single_linkage_tree, result_min_span_tree) = memory.cache(
869 _hdbscan_boruvka_balltree
870 )(
(...)
880 **kwargs
881 )
883 return (
--> 884 _tree_to_labels(
885 X,
886 single_linkage_tree,
887 min_cluster_size,
888 cluster_selection_method,
889 allow_single_cluster,
890 match_reference_implementation,
891 cluster_selection_epsilon,
892 max_cluster_size,
893 )
894 + (result_min_span_tree,)
895 )
File ~/opt/anaconda3/envs/top2vec/lib/python3.10/site-packages/hdbscan/hdbscan_.py:80, in _tree_to_labels(X, single_linkage_tree, min_cluster_size, cluster_selection_method, allow_single_cluster, match_reference_implementation, cluster_selection_epsilon, max_cluster_size)
78 condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
79 stability_dict = compute_stability(condensed_tree)
---> 80 labels, probabilities, stabilities = get_clusters(
81 condensed_tree,
82 stability_dict,
83 cluster_selection_method,
84 allow_single_cluster,
85 match_reference_implementation,
86 cluster_selection_epsilon,
87 max_cluster_size,
88 )
90 return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree)
File hdbscan/_hdbscan_tree.pyx:659, in hdbscan._hdbscan_tree.get_clusters()
File hdbscan/_hdbscan_tree.pyx:733, in hdbscan._hdbscan_tree.get_clusters()
TypeError: 'numpy.float64' object cannot be interpreted as an integer
The text was updated successfully, but these errors were encountered:
I'm training a Top2Vec model using the Universal Sentence Encoder, and I'm getting the following error:
`TypeError: 'numpy.float64' object cannot be interpreted as an integer`
All of my input text is of string type (I've quadruple-checked), and this has worked before, so I'm at a loss as to what is causing this error or where it originates. Is it coming from hdbscan?
The text was updated successfully, but these errors were encountered: