-
Notifications
You must be signed in to change notification settings - Fork 0
/
gender_classification.py
52 lines (35 loc) · 1.36 KB
/
gender_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import random
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names
def feature_extractor(word, N=2):
    """Build the classifier feature for a word from its last `N` letters.

    Args:
        word: The word (e.g. a first name) to take the suffix from.
        N: How many trailing letters to keep. A value larger than
            ``len(word)`` keeps the whole word; zero or a negative value
            keeps nothing.

    Returns:
        A one-entry dict ``{'feature': suffix}`` where ``suffix`` is the
        lower-cased last `N` letters of `word`.
    """
    # word[-N:] would return the whole word for N == 0 (because -0 == 0) and
    # would drop leading letters for negative N, so compute the start index
    # explicitly and clamp it at 0.
    start = max(len(word) - N, 0)
    last_n_letters = word[start:]
    return {'feature': last_n_letters.lower()}
def create_training_data():
    """Build the labelled name samples from the NLTK `names` corpus.

    Returns:
        A list of ``(name, gender)`` tuples: every entry of ``male.txt``
        labelled ``'male'``, followed by every entry of ``female.txt``
        labelled ``'female'``.
    """
    # Each sample must be an ordered tuple, not a set: the samples are later
    # unpacked as ``(name, gender)``, and a set has no guaranteed iteration
    # order, so the name and its label could come out swapped.
    male_list = [(name, 'male') for name in names.words('male.txt')]
    female_list = [(name, 'female') for name in names.words('female.txt')]
    return male_list + female_list
# Fix the random seed so the shuffle below is the same on every run
random.seed(5)

# Build the labelled samples and shuffle them in place
data = create_training_data()
random.shuffle(data)

# Names to classify with each trained model
input_names = ['Alexander', 'Danielle', 'DAvid', 'Cheryl']

# The first 80% of the shuffled samples train the model; the rest test it
num_train = int(0.8 * len(data))

# Train and evaluate one classifier per suffix length (1 to 5 letters)
for i in range(1, 6):
    print('\n Number of End Letters: ', i)

    # Turn every (name, gender) sample into a (feature dict, gender) pair
    features = [(feature_extractor(n, i), gender) for (n, gender) in data]

    # Separate the features into the training and test portions
    train_data = features[:num_train]
    test_data = features[num_train:]

    classifier = NaiveBayesClassifier.train(train_data)

    # Accuracy on the test portion, as a percentage rounded to 2 decimals
    accuracy = round(100 * nltk_accuracy(classifier, test_data), 2)
    print('Accuracy= {}%'.format(accuracy))

    # Predict the gender of each input name with this classifier
    for name in input_names:
        print(name, "==>", classifier.classify(feature_extractor(name, i)))