diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..e95443b06 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +cnn_stories_tokenized/ +dm_stories_tokenized/ +finished_files/ diff --git a/make_datafiles.py b/make_datafiles.py index bb431d5f0..aa5f03a49 100644 --- a/make_datafiles.py +++ b/make_datafiles.py @@ -59,24 +59,24 @@ def chunk_all(): os.mkdir(chunks_dir) # Chunk the data for set_name in ['train', 'val', 'test']: - print "Splitting %s data into chunks..." % set_name + print("Splitting %s data into chunks..." % set_name) chunk_file(set_name) - print "Saved chunked data in %s" % chunks_dir + print("Saved chunked data in %s" % chunks_dir) def tokenize_stories(stories_dir, tokenized_stories_dir): """Maps a whole directory of .story files to a tokenized version using Stanford CoreNLP Tokenizer""" - print "Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir) + print("Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir)) stories = os.listdir(stories_dir) # make IO list file - print "Making list of files to tokenize..." + print("Making list of files to tokenize...") with open("mapping.txt", "w") as f: for s in stories: f.write("%s \t %s\n" % (os.path.join(stories_dir, s), os.path.join(tokenized_stories_dir, s))) command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-ioFileList', '-preserveLines', 'mapping.txt'] - print "Tokenizing %i files in %s and saving in %s..." % (len(stories), stories_dir, tokenized_stories_dir) + print("Tokenizing %i files in %s and saving in %s..." % (len(stories), stories_dir, tokenized_stories_dir)) subprocess.call(command) - print "Stanford CoreNLP Tokenizer has finished." + print("Stanford CoreNLP Tokenizer has finished.") os.remove("mapping.txt") # Check that the tokenized stories directory contains the same number of files as the original directory @@ -84,7 +84,7 @@ def tokenize_stories(stories_dir, tokenized_stories_dir): num_tokenized = len(os.listdir(tokenized_stories_dir)) if num_orig != num_tokenized: raise Exception("The tokenized stories directory %s contains %i files, but it should contain the same number as %s (which has %i files). Was there an error during tokenization?" % (tokenized_stories_dir, num_tokenized, stories_dir, num_orig)) - print "Successfully finished tokenizing %s to %s.\n" % (stories_dir, tokenized_stories_dir) + print("Successfully finished tokenizing %s to %s.\n" % (stories_dir, tokenized_stories_dir)) def read_text_file(text_file): @@ -98,7 +98,7 @@ def read_text_file(text_file): def hashhex(s): """Returns a heximal formated SHA1 hash of the input string.""" h = hashlib.sha1() - h.update(s) + h.update(s.encode()) return h.hexdigest() @@ -149,7 +149,7 @@ def get_art_abs(story_file): def write_to_bin(url_file, out_file, makevocab=False): """Reads the tokenized .story files corresponding to the urls listed in the url_file and writes them to a out_file.""" - print "Making bin file for URLs listed in %s..." % url_file + print("Making bin file for URLs listed in %s..." % url_file) url_list = read_text_file(url_file) url_hashes = get_url_hashes(url_list) story_fnames = [s+".story" for s in url_hashes] @@ -161,7 +161,7 @@ def write_to_bin(url_file, out_file, makevocab=False): with open(out_file, 'wb') as writer: for idx,s in enumerate(story_fnames): if idx % 1000 == 0: - print "Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories)) + print("Writing story %i of %i; %.2f percent done" % (idx, num_stories, float(idx)*100.0/float(num_stories))) # Look in the tokenized story dirs to find the .story file corresponding to this url if os.path.isfile(os.path.join(cnn_tokenized_stories_dir, s)): @@ -169,9 +169,9 @@ def write_to_bin(url_file, out_file, makevocab=False): elif os.path.isfile(os.path.join(dm_tokenized_stories_dir, s)): story_file = os.path.join(dm_tokenized_stories_dir, s) else: - print "Error: Couldn't find tokenized story file %s in either tokenized story directories %s and %s. Was there an error during tokenization?" % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir) + print("Error: Couldn't find tokenized story file %s in either tokenized story directories %s and %s. Was there an error during tokenization?" % (s, cnn_tokenized_stories_dir, dm_tokenized_stories_dir)) # Check again if tokenized stories directories contain correct number of files - print "Checking that the tokenized stories directories %s and %s contain correct number of files..." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir) + print("Checking that the tokenized stories directories %s and %s contain correct number of files..." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir)) check_num_stories(cnn_tokenized_stories_dir, num_expected_cnn_stories) check_num_stories(dm_tokenized_stories_dir, num_expected_dm_stories) raise Exception("Tokenized stories directories %s and %s contain correct number of files but story file %s found in neither." % (cnn_tokenized_stories_dir, dm_tokenized_stories_dir, s)) @@ -181,8 +181,8 @@ def write_to_bin(url_file, out_file, makevocab=False): # Write to tf.Example tf_example = example_pb2.Example() - tf_example.features.feature['article'].bytes_list.value.extend([article]) - tf_example.features.feature['abstract'].bytes_list.value.extend([abstract]) + tf_example.features.feature['article'].bytes_list.value.extend([article.encode()]) + tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode()]) tf_example_str = tf_example.SerializeToString() str_len = len(tf_example_str) writer.write(struct.pack('q', str_len)) @@ -198,15 +198,15 @@ def write_to_bin(url_file, out_file, makevocab=False): tokens = [t for t in tokens if t!=""] # remove empty vocab_counter.update(tokens) - print "Finished writing file %s\n" % out_file + print("Finished writing file %s\n" % out_file) # write vocab to file if makevocab: - print "Writing vocab file..." + print("Writing vocab file...") with open(os.path.join(finished_files_dir, "vocab"), 'w') as writer: for word, count in vocab_counter.most_common(VOCAB_SIZE): writer.write(word + ' ' + str(count) + '\n') - print "Finished writing vocab file" + print("Finished writing vocab file") def check_num_stories(stories_dir, num_expected): @@ -217,7 +217,7 @@ def check_num_stories(stories_dir, num_expected): if __name__ == '__main__': if len(sys.argv) != 3: - print "USAGE: python make_datafiles.py " + print("USAGE: python make_datafiles.py ") sys.exit() cnn_stories_dir = sys.argv[1] dm_stories_dir = sys.argv[2]