Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update TextFeatureSelection.py #33

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions build/lib/TextFeatureSelection.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ def _cost_function_value(self,y_test,y_test_pred,cost_function,avrg):
return metric


def _computeFitness(self,gene,unique_words,x,y,model,model_metric,avrg,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase):
def _computeFitness(self,gene,unique_words,x,y,model,model_metric,avrg,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase,vocabulary,ngram_range):
### create tfidf matrix for only terms which are in gene
# get terms from gene and vocabulary combnation
term_to_use=list(np.array(unique_words)[list(map(bool,gene))])
Expand All @@ -511,7 +511,7 @@ def _computeFitness(self,gene,unique_words,x,y,model,model_metric,avrg,analyzer,
y_train, y_test = np.array(y)[train_index],np.array(y)[test_index]

##based on vocabulary set, create tfidf matrix for train and test data
tfidf=TfidfVectorizer(vocabulary=term_to_use,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase)
tfidf=TfidfVectorizer(vocabulary=term_to_use,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,ngram_range=ngram_range)
tfidfvec_vectorizer=tfidf.fit(X_train)

#get x train and test
Expand Down Expand Up @@ -564,7 +564,7 @@ def _get_population(self,population,population_matrix,population_array):
return population_matrix


def _get_parents(self,population_array,population_matrix,unique_words,x,y,model,model_metric,avrg,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase):
def _get_parents(self,population_array,population_matrix,unique_words,x,y,model,model_metric,avrg,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase,vocabulary,ngram_range):

#keep space for best chromosome
parents = np.empty((0,population_array.shape[0]))
Expand All @@ -587,15 +587,15 @@ def _get_parents(self,population_array,population_matrix,unique_words,x,y,model,
##gene pool 1
gene_1 = population_matrix[index_run[0]]
#cost of gene 1
cost1=self._computeFitness(gene=gene_1,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase)
cost1=self._computeFitness(gene=gene_1,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range)
##gene pool 2
gene_2 = population_matrix[index_run[1]]
#cost of gene 2
cost2=self._computeFitness(gene=gene_2,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase)
cost2=self._computeFitness(gene=gene_2,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range)
##gene pool 3
gene_3 = population_matrix[index_run[2]]
#cost of gene 3
cost3=self._computeFitness(gene=gene_3,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase)
cost3=self._computeFitness(gene=gene_3,unique_words=unique_words,x=x,y=y,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range)

#get best chromosome from 3 and assign best chromosome.
if cost1==max(cost1,cost2,cost3):
Expand Down Expand Up @@ -681,7 +681,7 @@ def _mutation(self,child,prob_mutation):
t = t+1
return mutated_child

def _getPopulationAndMatrix(self,doc_list,label_list,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase):
def _getPopulationAndMatrix(self,doc_list,label_list,analyzer,min_df,max_df,stop_words,tokenizer,token_pattern,lowercase,vocabulary,ngram_range):
#get null free df
temp_df=pd.DataFrame({'doc_list':doc_list,'label_list':label_list})
temp_df=temp_df[(~temp_df['doc_list'].isna()) & (~temp_df['label_list'].isna())]
Expand All @@ -692,7 +692,7 @@ def _getPopulationAndMatrix(self,doc_list,label_list,analyzer,min_df,max_df,stop
gc.collect()

#get unique tokens
tfidfvec = TfidfVectorizer(analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase)
tfidfvec = TfidfVectorizer(analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,vocabulary=vocabulary,ngram_range=ngram_range,lowercase=lowercase)
tfidfvec_vectorizer = tfidfvec.fit(doc_list)
unique_words=list(tfidfvec_vectorizer.vocabulary_.keys())

Expand All @@ -715,7 +715,7 @@ def _getPopulationAndMatrix(self,doc_list,label_list,analyzer,min_df,max_df,stop

return doc_list,label_list,unique_words,population_array,population_matrix,best_of_a_generation

def getGeneticFeatures(self,doc_list,label_list,model=LogisticRegression(),model_metric='f1',avrg='binary',analyzer='word',min_df=2,max_df=1.0,stop_words=None,tokenizer=None,token_pattern='(?u)\\b\\w\\w+\\b',lowercase=True):
def getGeneticFeatures(self,doc_list,label_list,model=LogisticRegression(),model_metric='f1',avrg='binary',analyzer='word',min_df=2,max_df=1.0,stop_words=None,tokenizer=None,token_pattern='(?u)\\b\\w\\w+\\b',lowercase=True,vocabulary=None,ngram_range=(1,1)):
'''
Data Parameters
----------
Expand Down Expand Up @@ -805,7 +805,7 @@ def getGeneticFeatures(self,doc_list,label_list,model=LogisticRegression(),model
avrg='binary'

#get all parameters needed for GA
doc_list,label_list,unique_words,population_array,population_matrix,best_of_a_generation=self._getPopulationAndMatrix(doc_list,label_list,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase)
doc_list,label_list,unique_words,population_array,population_matrix,best_of_a_generation=self._getPopulationAndMatrix(doc_list,label_list,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range)

#Execute GA
for genrtn in range(self.generations):
Expand All @@ -829,7 +829,7 @@ def getGeneticFeatures(self,doc_list,label_list,model=LogisticRegression(),model
# Doing it half the population size will mean getting matrix of population size equal to original matrix
for family in range(int(self.population/2)):
#get parents
parent1,parent2=self._get_parents(population_array=population_array,population_matrix=population_matrix,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase)
parent1,parent2=self._get_parents(population_array=population_array,population_matrix=population_matrix,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range)

#crossover
child1,child2=self._crossover(parent1=parent1,parent2=parent2,prob_crossover=self.prob_crossover)
Expand All @@ -839,8 +839,8 @@ def getGeneticFeatures(self,doc_list,label_list,model=LogisticRegression(),model
mutated_child2=self._mutation(child=child2,prob_mutation=self.prob_mutation)

#get cost function for 2 mutated child and print for generation, family and child
cost1=self._computeFitness(gene=mutated_child1,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase)
cost2=self._computeFitness(gene=mutated_child2,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase)
cost1=self._computeFitness(gene=mutated_child1,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range)
cost2=self._computeFitness(gene=mutated_child2,unique_words=unique_words,x=doc_list,y=label_list,model=model,model_metric=model_metric,avrg=avrg,analyzer=analyzer,min_df=min_df,max_df=max_df,stop_words=stop_words,tokenizer=tokenizer,token_pattern=token_pattern,lowercase=lowercase,vocabulary=vocabulary,ngram_range=ngram_range)

#create population for next generaion
new_population = np.vstack((new_population,mutated_child1,mutated_child2))
Expand Down