-
Notifications
You must be signed in to change notification settings - Fork 1
/
histogram_maker.py
86 lines (79 loc) · 3.14 KB
/
histogram_maker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
This file creates and stores histograms of a source text.
This is a preamble to a tweet generator assignment for CS 1.2.
"""
def get_source_text(file_name):
# import text file and clean it of all punctuation
punctuations = """!@#$%^&*()_-+=[]{}\|;:,./<>?`~"""
source = open(file_name, "r").read().replace("\n", "").split()
# checking word for punctuation and removing it
for word in source:
for character in word:
if character in punctuations:
word_index = source.index(word)
source.pop(word_index)
# create new string and add it back to the source text
new_word = word.replace(character, "")
source.insert(word_index, new_word)
word = new_word
continue
else:
pass
return source
def histogram(source_text):
# takes in source text and returns a histogram in the form of a list of lists
initial_histogram = [] # empty list to store histogram
# checks each word in the source text
for word in source_text:
# lists that will go inside of histogram list
count_and_word = []
# counts up the words in the source text and adds count and word to list
word_count = source_text.count(word)
count_and_word.append(word_count)
count_and_word.append(word)
# then adds the smaller list to the histogram list
initial_histogram.append(count_and_word)
# removes duplicates from initial_histogram and adds to new histogram list
histogram = []
for list in initial_histogram:
if list not in histogram:
histogram.append(list)
else:
pass
# returns a list of lists
return histogram
def unique_words(histogram):
# this function counts up the first occurance of all words
unique_words_count = 0
# iterate over the whole list
for gram in histogram:
# increment our unique_words_count
unique_words_count += 1
# return the count as an integer
return unique_words_count
def frequency(word, histogram):
# counts up the number of times that a word appears
frequency_counter = 0
# iterate over the histogram list
for gram in histogram:
# check if the first element is the same as the word parameter
if gram[1] == word:
# if it is, then increment the frequency_counter by the value of the zeroth element
frequency_counter += gram[0]
else:
pass
return frequency_counter
def logger(logger_file, data_structure, histogram, unique_words, word, frequency):
f = open(logger_file, "a")
f.write("""
Data Structure Type: {}
Histogram: {}
Number of Unique Words: {}
Frequency of the word '{}': {}
""".format(data_structure, histogram, unique_words, word, frequency))
if __name__ in '__main__':
source_text = get_source_text('souls_of_black_folk.txt')
histogram = histogram(source_text)
unique_words = unique_words(histogram)
frequency = frequency("light", histogram)
logger("histogram_logger.txt", "List of lists", histogram, unique_words, "light", frequency)