Commit
Plotting, loading etc.
onurgu committed Jan 21, 2018
1 parent ccfe338 commit d5edee3
Showing 4 changed files with 176 additions and 24 deletions.
3 changes: 3 additions & 0 deletions Pipfile
@@ -15,6 +15,9 @@ gensim = "*"
 nltk = "*"
 ipython = "*"
 matplotlib = "*"
+data-hacks = "*"
+seaborn = "*"
+jupyter = "*"
 
 
 [requires]
2 changes: 1 addition & 1 deletion construct_vocab.py
@@ -168,7 +168,7 @@ def combine_files_in_the_pdf_directory(filepath):

 lda.print_topics(20)
 
-lda.save("tbmm_lda.model")
+lda.save(args.corpus_filename + ".tbmm_lda.model")
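
This one-line change keys the saved model file to the corpus it was trained on, so runs over different corpus versions stop overwriting a single "tbmm_lda.model". A minimal sketch of the save/load round trip with gensim (the training call and the `args` parsing are assumptions, not shown in this diff):

from gensim.models.ldamodel import LdaModel

# Assumed context: `corpus`, `dictionary`, and `args.corpus_filename` exist in the script.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
model_path = args.corpus_filename + ".tbmm_lda.model"
lda.save(model_path)             # gensim may write auxiliary files alongside this path
lda = LdaModel.load(model_path)  # reload later, e.g. from corpus_loader.py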



30 changes: 23 additions & 7 deletions corpus_loader.py
@@ -1,8 +1,13 @@
 import configparser
-import tbmmcorpus
+import tbmmcorpus
 
 DEV=False
-TOPIC_DISTRIBUTIONS=False
+TOPIC_DISTRIBUTIONS=True
 
+if DEV:
+    corpus_filepath = "corpus-dev/tbmm_corpus"
+else:
+    corpus_filepath = "corpus-v0.2/tbmm_corpus.mm"
+
 config_parser = configparser.ConfigParser()
 config_parser.read("config.ini")
@@ -11,21 +16,32 @@

 corpus = tbmmcorpus.TbmmCorpus(metadata=True, config=config)
 if DEV:
-    corpus.load_tbmm_corpus("corpus-dev/tbmm_corpus")
+    corpus.load_tbmm_corpus(corpus_filepath)
 else:
-    corpus.load_tbmm_corpus("corpus-v0.1/tbmm_corpus.mm")
+    corpus.load_tbmm_corpus(corpus_filepath)
 
 corpus.prepare_metadata_to_description_dictionary()
 corpus.generate_word_counts()
 
 if TOPIC_DISTRIBUTIONS:
 
+    if DEV:
+        lda_model_path = "corpus-dev/tbmm_corpus.tbmm_lda.model"
+    else:
+        lda_model_path = "corpus-v0.2/tbmm_lda.model.passes_100"
+
+    corpus.prepare_metadata_to_description_dictionary()
+
+    corpus.generate_word_counts()
+
     from gensim.models.ldamodel import LdaModel
-    lda = LdaModel.load("corpus-v0.1/tbmm_lda.model")
+    # lda = LdaModel.load("corpus-v0.1/tbmm_lda.model")
+    if DEV:
+        lda = LdaModel.load(lda_model_path)
+    else:
+        lda = LdaModel.load(lda_model_path)
 
     topic_dist_matrix, label_vector = corpus.calculate_topic_distributions_of_all_documents(lda)
 
-    for topic_no in range(1, 20):
-        corpus.plot_topic_across_time(topic_no, topic_dist_matrix, label_vector)
+    # for topic_no in range(1, 20):
+    #     corpus.plot_topic_across_time(topic_no, topic_dist_matrix, label_vector)
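
Both branches of the new `if DEV:` guard around `LdaModel.load` load the same `lda_model_path`, so that conditional is redundant. A condensed sketch of the equivalent flow (not part of the commit; `corpus` is the TbmmCorpus built above):

from gensim.models.ldamodel import LdaModel

DEV = False
corpus_filepath, lda_model_path = {
    True:  ("corpus-dev/tbmm_corpus", "corpus-dev/tbmm_corpus.tbmm_lda.model"),
    False: ("corpus-v0.2/tbmm_corpus.mm", "corpus-v0.2/tbmm_lda.model.passes_100"),
}[DEV]

corpus.load_tbmm_corpus(corpus_filepath)  # corpus built as above
lda = LdaModel.load(lda_model_path)       # one load call covers both branches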
165 changes: 149 additions & 16 deletions tbmmcorpus.py
@@ -74,6 +74,8 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter

         self.config = config
 
+        self.date_mappings = {}
+
     @staticmethod
     def filter_extremes(dictionary_object, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
         """
@@ -253,6 +255,11 @@ def load_tbmm_corpus(self, fname):

         self.dictionary = self.dictionary.load_from_text(fname + ".vocabulary.txt")
 
+        import pickle
+        with open(fname + '.date_mappings.pkl', 'rb') as f:
+            self.date_mappings = pickle.load(f)
+
+
     @staticmethod
     def get_document_topics(corpus, lda, document):
         """
@@ -362,31 +369,35 @@ def plot_word_freqs_given_a_regexp(self, regexp_to_select_keywords, keyword="def
                                               format=format)
         return plot_values, counts, total_count, all_keywords
 
-    def plot_word_freqs_given_a_regexp_for_each_year(self, lo_regexp_to_select_keywords, keyword="default", format="pdf"):
+    def plot_word_freqs_given_a_regexp_for_each_year(self, lo_regexp_to_select_keywords, legend_labels, keyword="default", format="pdf"):
         fig = plt.figure(figsize=(16, 9), dpi=300)
         plt.gca().spines['top'].set_visible(False)
         plt.gca().spines['right'].set_visible(False)
         linestyles = ['-', '--', '-.', ':']
         legends = []
-        for regexp_to_select_keywords in lo_regexp_to_select_keywords:
+        handles = []
+        for idx, regexp_to_select_keywords in enumerate(lo_regexp_to_select_keywords):
             donem_dict_normalized, counts, total_count, all_keywords = self._word_freqs_given_a_regexp_for_each_year(regexp_to_select_keywords)
-            plot_values = donem_dict_normalized
+            plot_values = sorted(donem_dict_normalized.items(), key=lambda x: x[0])
             linestyle = linestyles.pop()
-            plt.plot([x[0] for x in plot_values] , [x[1] for x in plot_values],
-                     label=regexp_to_select_keywords,
-                     linestyle=linestyle)
+            line, = plt.plot([x[0] for x in plot_values], [x[1] for x in plot_values],
+                             label=legend_labels[idx],
+                             linestyle=linestyle)
+            handles += [line]
             legends.append(regexp_to_select_keywords)
 
         #plt.xticks(range(0, len(plot_values), 100),
         #           [plot_values[i][0].split("/")[1] for i in range(0, len(plot_values), 100)],
         #           rotation='vertical')
 
+        plt.legend(handles=handles)
+
         #plt.margins(0.2)
         plt.subplots_adjust(bottom=0.15)
         filename = os.path.join(self.config["plots_dir"], keyword+"_normalized")
         fig.savefig(filename + "." + format)
-        import ipdb ; ipdb.set_trace()
+        # import ipdb ; ipdb.set_trace()
 
     def _word_freqs_given_a_regexp_for_each_year(self, regexp_to_select_keywords):
         """
@@ -451,30 +462,91 @@ def calculate_topic_distributions_of_all_documents(self, lda):
         n_topics = lda.num_topics
         topic_dist_matrix = []
         label_vector = []
-        for idx, (doc_id, document_bow) in enumerate(self.documents_word_counts.items()):
+
+        unsorted_filepaths = [(doc_id, x['filepath']) for doc_id, x in self.documents_metadata.items() if
+                              re.match(r"^(tbmm|tbt|mgk)/", x['filepath'])]
+
+        for idx, (doc_id, filepath) in enumerate(unsorted_filepaths):
+            document_bow = self.documents_word_counts[doc_id]
             topic_dist = lda.get_document_topics(document_bow)
             topic_dist_full_vector = [0] * n_topics
             for topic_id, prob in topic_dist:
                 topic_dist_full_vector[topic_id] = prob
             topic_dist_matrix += [topic_dist_full_vector]
-            label_vector += [self.documents_metadata[doc_id]['filepath']]
+            label_vector += [filepath]
 
         return topic_dist_matrix, label_vector
 
-    def plot_topic_by_year(self, topic_no, topic_dist_matrix, label_vector, format="pdf"):
-        import ipdb ; ipdb.set_trace()
-        sorted_zipped_topic_dist_matrix = sorted(zip(topic_dist_matrix, label_vector),
-                                                 key=cmp_to_key(self.compare_two_document_labels))
+    # def plot_topic_by_year(self, topic_no, topic_dist_matrix, label_vector, format="pdf"):
+    #     # import ipdb ; ipdb.set_trace()
+    #     fig = plt.figure()
+    #     sorted_zipped_topic_dist_matrix = sorted(zip(topic_dist_matrix, label_vector),
+    #                                              key=cmp_to_key(self.compare_two_document_labels))
+    #
+    #     tbmm_topic_dist_matrix = sorted_zipped_topic_dist_matrix
+    #
+    #     plot_values = [(value[1], value[0][topic_no]) for id, value in enumerate(tbmm_topic_dist_matrix)]
+    #
+    #     plt.plot([x[0] for x in plot_values] , [x[1] for x in plot_values], label="Topic %d" % topic_no)
+    #     plt.subplots_adjust(bottom=0.15)
+    #     filename = os.path.join(self.config["plots_dir"], "topic_%d" % topic_no)
+    #     fig.savefig(filename + "." + format)
 
-        tbmm_topic_dist_matrix = sorted_zipped_topic_dist_matrix
+    def plot_a_specific_topic_by_year(self, topics, topic_dist_matrix, label_vector, legend_labels, keyword="default_topic", format="pdf"):
+        fig = plt.figure(figsize=(16, 9), dpi=300)
+        plt.gca().spines['top'].set_visible(False)
+        plt.gca().spines['right'].set_visible(False)
+        linestyles = ['-', '--', '-.', ':']
+        markerstyles = ['+', '.', 'o', 'v', '^']
 
-        plot_values = [(value[1], value[0][topic_no]) for id, value in enumerate(tbmm_topic_dist_matrix)]
+        handles = []
+        for idx, topic_no in enumerate(topics):
+
+            donem_dict_normalized = self._get_topic_normalized_for_each_year(topic_no,
+                                                                             topic_dist_matrix,
+                                                                             label_vector)
+
+            plot_values = sorted(donem_dict_normalized.items(), key=lambda x: x[0])
+            if idx < len(linestyles):
+                linestyle = linestyles[-(idx+1)]
+                markerstyle = ""
+            else:
+                linestyle = linestyles[0]
+                markerstyle = markerstyles[-(idx+1)]
+
+            line, = plt.plot([x[0] for x in plot_values], [x[1] for x in plot_values],
+                             label=legend_labels[idx],
+                             linestyle=linestyle,
+                             marker=markerstyle)
+            handles += [line]
 
-        plt.plot([x[0] for x in plot_values] , [x[1] for x in plot_values], label="Topic %d" % topic_no)
+        #plt.xticks(range(0, len(plot_values), 100),
+        #           [plot_values[i][0].split("/")[1] for i in range(0, len(plot_values), 100)],
+        #           rotation='vertical')
+
+        plt.legend(handles=handles)
+
+        #plt.margins(0.2)
         plt.subplots_adjust(bottom=0.15)
-        filename = os.path.join(self.config["plots_dir"], "topic_%d" % topic_no)
+        filename = os.path.join(self.config["plots_dir"], keyword+"_normalized")
         fig.savefig(filename + "." + format)
+        # import ipdb ; ipdb.set_trace()
+
+    def _get_topic_normalized_for_each_year(self, topic_no, topic_dist_matrix, label_vector):
+
+        donem_dict = dd(int)
+        donem_doc_count = dd(int)
+        donem_dict_normalized = dd(int)
+
+        for idx, label in enumerate(label_vector):
+            term_str = label.split("/")[1]
+            donem_dict[term_str] += topic_dist_matrix[idx][topic_no]
+            donem_doc_count[term_str] += 1
+
+        for term in donem_dict.keys():
+            donem_dict_normalized[year_mapping[term]] = donem_dict[term] / donem_doc_count[term]
+
+        return donem_dict_normalized
 
     def plot_topic_across_time(self, topic_no, topic_dist_matrix, label_vector, format="pdf"):
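
Two details here are worth seeing in isolation: gensim's `get_document_topics` returns a sparse `(topic_id, probability)` list, which the loop pads to a fixed-length vector, and `_get_topic_normalized_for_each_year` then averages a topic's weight per legislative term (dönem). A self-contained sketch with made-up numbers:

from collections import defaultdict as dd  # matches the alias used in the method

n_topics = 5
topic_dist = [(1, 0.7), (3, 0.3)]   # sparse output, e.g. from lda.get_document_topics(bow)
dense = [0] * n_topics
for topic_id, prob in topic_dist:
    dense[topic_id] = prob
# dense == [0, 0.7, 0, 0.3, 0]

# Per-term averaging: sum of a topic's weight over a term's documents / document count.
weights = {"tbmm/d01/doc1": 0.7, "tbmm/d01/doc2": 0.3}  # illustrative labels and weights
donem_dict, donem_doc_count = dd(float), dd(int)
for label, w in weights.items():
    term = label.split("/")[1]           # "d01"
    donem_dict[term] += w
    donem_doc_count[term] += 1
average = donem_dict["d01"] / donem_doc_count["d01"]  # 0.5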

@@ -555,6 +627,65 @@ def prepare_metadata_to_description_dictionary(self):

         self.compare_two_document_labels = _compare_two_document_labels(coded_filepaths)
 
+    def calculate_intervals(self):
+
+        dates = {}
+        for k, v in self.date_mappings.items():
+            _key = int(v['interval'][0][-4:])
+            if _key in dates:
+                dates[_key].append(k)
+            else:
+                dates[_key] = [k]
+
+        for k, v in dates.items():
+            for i in range(len(v)):
+                if v[i].startswith('tbt-ty') and len(v[i]) == 7:
+                    v[i] = v[i][:6] + '0' + v[i][6:]
+
+        for k, v in dates.items():
+            for i in range(len(v)):
+                if v[i].startswith('cs-ty') and len(v[i]) == 6:
+                    v[i] = v[i][:5] + '0' + v[i][5:]
+
+        years = sorted(dates.keys())
+        change_points = [1923, 1938, 1946, 1960, 1980, 1991, 2002]
+
+        codes = {1923: []}
+        point = 0
+        for year in years:
+            if year < (change_points[point + 1] if point + 1 < len(change_points) else 5000):
+                codes[change_points[point]] += dates[year]
+            else:
+                point += 1
+                # seed the new era with this year's documents, so the
+                # transition year is not dropped
+                codes[change_points[point]] = list(dates[year])
+
+        metadata2id = {v['filepath']: k for k, v in self.documents_metadata.items()}
+
+        temp = {}
+        for k, v in metadata2id.items():
+            _key = k.split('/')[1]
+            if _key in temp:
+                temp[_key].append(v)
+            else:
+                temp[_key] = [v]
+        metadata2id = temp
+
+        merged_dates = {}
+        for date, arr in codes.items():
+            for code in arr:
+                if date in merged_dates:
+                    if code in metadata2id:
+                        merged_dates[date] += metadata2id[code]
+                    else:
+                        print('{} does not exist in metadata!'.format(code))
+                else:
+                    if code in metadata2id:
+                        merged_dates[date] = metadata2id[code]
+                    else:
+                        print('{} does not exist in metadata!'.format(code))
+
+        self.documents_date_groups = merged_dates
+
 
 def prepare_for_analysis():
     import configparser
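
The era-binning loop in `calculate_intervals` assigns each document year to the most recent political change point at or before it. For years from 1923 on, `bisect` expresses the same assignment; a sketch:

import bisect

change_points = [1923, 1938, 1946, 1960, 1980, 1991, 2002]

def era_of(year):
    # rightmost change point <= year; assumes year >= 1923,
    # matching the {1923: []} initialisation in the method above
    return change_points[bisect.bisect_right(change_points, year) - 1]

assert era_of(1950) == 1946
assert era_of(1991) == 1991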
@@ -587,6 +718,8 @@ def prepare_for_analysis():

     corpus.plot_word_freqs_given_a_regexp(r"^lokavt", keyword="lokavt")
 
+    corpus.plot_word_freqs_given_a_regexp(r"^mebus", keyword="mebus")
+



