diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d967c08 --- /dev/null +++ b/.gitignore @@ -0,0 +1,261 @@ +*.log +python_overnight.sh +java_overnight.sh + +.idea +sample_data + + +# Created by https://www.toptal.com/developers/gitignore/api/pycharm+all,python +# Edit at https://www.toptal.com/developers/gitignore?templates=pycharm+all,python + +### PyCharm+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### PyCharm+all Patch ### +# Ignores the whole .idea folder and all .iml files +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + +.idea/* + +# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 + +*.iml +modules.xml +.idea/misc.xml +*.ipr + +# Sonarlint plugin +.idea/sonarlint + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintainted in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# End of https://www.toptal.com/developers/gitignore/api/pycharm+all,python \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..d56066d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,52 @@ +FROM nvcr.io/nvidia/tensorflow:22.01-tf2-py3 +# The Tag 22.01 stands for january 2022. +# The version 22.01 comes with tensorflow 2.6 and python 3.8 + +# Entry-Point Orchestration-Flags +ENV DO_TRAIN="true" +ENV DO_TEST="true" +ENV DO_COMPLETION="true" + +ENV VERBOSE="false" + +# Filenames +ENV TRAIN_FILE=java_training_slp_pre_enc_bpe_10000 +ENV VALIDATION_FILE=java_validation_slp_pre_enc_bpe_10000 +ENV TEST_FILE=java_test_slp_pre_enc_bpe_10000 +ENV TEST_PROJ_NAMES_FILE=testProjects +ENV ID_MAP_FILE=/data/java/id_map_java_test_slp_pre_bpe_10000 +# Directory that contains train/validation/test data etc. +ENV DATA_HOME=/data/java/ +# Directory in which the model will be saved. +ENV MODEL_DIR=/models/java/model + +# Maximum training epochs +ENV EPOCHS=2 +# Initial learning rate +ENV LR=0.1 +# Training batch size +ENV BATCH_SIZE=32 +# RNN unroll timesteps for gradient calculation. 
+ENV STEPS=200
+# 1 - Dropout probability
+ENV KEEP_PROB=0.5
+# RNN hidden state size
+ENV STATE_DIMS=512
+# Checkpoint and validation loss calculation frequency.
+ENV CHECKPOINT_EVERY=5000
+
+
+WORKDIR /openvocabcodenlm
+
+COPY reduced_requirements.txt .
+RUN pip install -r reduced_requirements.txt
+
+COPY util util
+COPY reader.py .
+COPY code_nlm.py .
+COPY create_subtoken_data.py .
+COPY non-ascii_sequences_to_unk.py .
+
+COPY entrypoint.sh .
+
+ENTRYPOINT ["bash","entrypoint.sh"]
\ No newline at end of file
diff --git a/README.md b/README.md
index fecab8a..65cdd8c 100644
--- a/README.md
+++ b/README.md
@@ -1,170 +1,83 @@
-# OpenVocabNLMs
-Contains the code for our ICSE 2020 submission: open vocabulary language model for source code that uses the byte pair encoding algorithm to learn a segmentation of code tokens into subtokens.
+# OpenVocabCodeNLM - Reproduction & Bump
-If you use our code/implementation, datasets or pre-trained models please cite our paper:
-@inproceedings{Karampatsis2020ICSE,\
-author = {Karampatsis, Rafael - Michael and Babii, Hlib and Robbes, Romain and Sutton, Charles and Janes, Andrea},\
-title = {{Big Code != Big Vocabulary: Open-Vocabulary Models for Source code}},\
-year = {2020},\
-publisher = {ACM},\
-url = {https://doi.org/10.1145/3377811.3380342}, \
-doi = {10.1145/3377811.3380342},\
-booktitle = {Proceedings of the 42nd International Conference on Software Engineering},\
-pages = {},\
-numpages = {11},\
-location = {Seoul, South Korea},\
-series = {ICSE ’20}\
-}
+This is a reproduction and adjustment of OpenVocabCodeNLM.
+I ran into issues, mostly with mismatched library versions and GPU drivers,
+so this repository aims to bring the project up to date again,
+with modern Python, TensorFlow and Docker.
+Also see the [original Readme](./original_README.md).
+See the [original repository](https://github.com/mast-group/OpenVocabCodeNLM).
-# Code Structure
-**non-ascii_sequences_to_unk.py** is a preprocessing script that can be used to remove non-ascii sequences from the data and replace them with a special symbol.
-**create_subtoken_data.py** is also a preprocessing script that can be used to subtokenize data based on the heuristic of [Allamanis et al. (2015)](https://miltos.allamanis.com/publications/2015suggesting/).
+## Changes
-**reader.py** contains utility functions for reading data and providing batches for training and testing of models.
+1. Ran the [TensorFlow migration script](https://blog.tensorflow.org/2019/02/upgrading-your-code-to-tensorflow-2-0.html)
+2. Adjusted the reshape for the cost function, as tensorflow-addons (tfa) requires a different format
+3. Adjusted the reshape for the cost function separately for completion and perplexity
+4. Added some prints (they might be removed later)
+5. Adjusted the loss function's default behavior so it does not average over the batch (this is done manually later)
+6. Added a Dockerfile and a reduced requirements file
+7. Changed some prints to logging calls
-**code_nlm.py** contains the implementation of our NLM for code and supports training, perplexity/cross-entropy calculation, code-completion simulation as well as dynamic versions of the test scenarios. The updated implementation has also some new features, previously not present in the code. That is measuring identifier specific performance for code completion. Another new feature implements a simple n-gram cache for identifiers that better simulates use of the model in an IDE where such information would be present. In order to use the identifier features a file containing identifier information must be provided through the options.
+## Environment
-# Installation
+ - Windows 10
+ - CUDA 11.6
+ - Python 3.9.9
+ - TensorFlow 2.6
-Python>2.7.6 or Python==3.6 is required!
-Python>3.6 is not supported due to the tensorflow version not supporting it.
+I additionally had to install a matching Keras version manually with pip:
-```shell script
-git clone https://github.com/mast-group/OpenVocabCodeNLM
-cd OpenVocabCodeNLM
-pip install -r requirements.txt #python2
-pip3 install -r requirements.txt #python3
+```
+pip install keras==2.6
```
-The experiments in the paper were performed using Python 2.7.14 but we have currently not experienced any unresolved issue with Python 3.
-In case you encounter any issues please open a new issue entry.
+This repository has **two** requirements.txt files - one is for Windows, while *reduced_requirements.txt* is for the Docker container.
+The exact Windows package versions were not available for the Ubuntu image used in Docker.
-# Usage Instructions
-If you want to try the implementation unzip the directory containing the sample data.
-The sample data contain the small training set, validation, and test set used in the paper with a BPE encdoding size of 10000.
+**Optionally**:
+This repository provides a Dockerfile starting from the official NVIDIA TensorFlow image.
+It should utilize a properly set up GPU on a Linux machine - other operating systems are not supported!
+On unsupported operating systems, or insufficiently set up Linux machines, it will default to using the CPU.
+This is printed when the container starts.
+The examples were created and run with Docker 20.10 and Docker Compose v2.2.3.
-## Option Constants
-Let's first define constants for pointing to the data and network parameters. You'll need to modify these to point to your own data and satisfy the hyperparameters that you want to use.
-```
-# Directory that contains train/validation/test data etc.
-DATA_HOME=sample_data/java/
-# Directory in which the model will be saved.
-MODEL_DIR=sample_data/java/model
-mkdir $MODEL_DIR
-
-# Filenames
-TRAIN_FILE=java_training_slp_pre_enc_bpe_10000
-VALIDATION_FILE=java_validation_slp_pre_enc_bpe_10000
-TEST_FILE=java_test_slp_pre_enc_bpe_10000
-TEST_PROJ_NAMES_FILE=testProjects
-ID_MAP_FILE=sample_data/java/id_map_java_test_slp_pre_bpe_10000
-
-# Maximum training epochs
-EPOCHS=5 # Normally this would be larger. For instance 30-50
-# Initial learning rate
-LR=0.1 # This is the default value. You can skip it if you don't want to change it.
-# Training batch size
-BATCH_SIZE=32 # This is also the default.
-# RNN unroll timesteps for gradient calculation.
-STEPS=20 # 20-50 is a good range of values for dynamic experiments.
-# 1 - Dropout probability
-KEEP_PROB=0.5 # This is also the default.
-# RNN hidden state size
-STATE_DIMS=512 # This is also the default.
-# Checkpoint and validation loss calculation frequency.
-CHECKPOINT_EVERY=5000 # This is also the default.
-
-
-# Understanding boolean options.
-# Most boolean options are set to False by default.
-# For using any boolean option set it to True.
-# For instance for using a GRU instead of an LSTM add to your command the option --gru True.
-```
+The containers are **very memory hungry**. Limiting resources in the compose file is highly advised (see the sketch below the licence warning).
+## Licence Warning
-We next present the various scenarios supported by our implementation.
+The original OpenVocabCodeNLM is under the Apache License (the same as this fork).
-## Training
-The training scenario creates a global model by training on the provided to it training data.
-We will train a Java model with a BPE encoding of 10000 using the sample data.
-In the following training example we set some of the hyperparameters (to their default values though).
-Optionally, you can set all of them to your intented values.
-Since the data is tokenized into subwords we need to let the script know so that it can calculate the metrics correctly.
-For this reason we need to set the *word_level_perplexity* flag to **True**.
-In order to also output validation cross-entropy instead of perplexity we set the *cross_entropy* option to **True**.
+But the used NVIDIA container comes with an implicit licence agreement. Please study it carefully before using it.
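+As a rough, untested sketch (the service name, build context, host paths and limit values below are placeholders, not values taken from this repository), memory and CPU limits as well as the batch size could be constrained in the compose file roughly like this:
+
+```yaml
+services:
+  experiment:
+    build: .
+    environment:
+      BATCH_SIZE: "16"                  # smaller batches need less memory, see Troubleshooting below
+    volumes:
+      - ./sample_data/java:/data/java   # host paths are placeholders; match the container side to DATA_HOME / MODEL_DIR
+      - ./models/java:/models/java
+    deploy:
+      resources:
+        limits:
+          memory: 16g                   # cap the container's RAM
+          cpus: "8"
+```
+
+Recent Docker Compose v2 releases should apply the `deploy.resources.limits` section even outside of swarm mode; if your version does not, `mem_limit` is an alternative.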
-``` -# Train a small java model for 1 epoch. -python code_nlm.py --data_path $DATA_HOME --train_dir $MODEL_DIR --train_filename $TRAIN_FILE --validation_filename $VALIDATION_FILE --gru True --hidden_size $STATE_DIMS --batch_size $BATCH_SIZE --word_level_perplexity True --cross_entropy True --steps_per_checkpoint $CHECKPOINT_EVERY --max_epoch $EPOCHS - -# Because we are using the default values we could shorten the above command to: -# python code_nlm.py --data_path $DATA_HOME --train_dir $MODEL_DIR --train_filename $TRAIN_FILE --validation_filename $VALIDATION_FILE --gru True --word_level_perplexity True --cross_entropy True --max_epoch $EPOCHS -``` +## Troubleshooting -## Test Scenarios -### Test Entropy Calculation -``` -# Testing the model (Calculating test set entropy) -python code_nlm.py --test True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --word_level_perplexity True --cross_entropy True -``` +### Warnings / Errors in Python -#### Dynamically Adapt the Model on Test Data -In order to dynamically adapt the model, the implementation needs to know when it is testing on a new project, so that it can revert the model back to the global one. -This is achieved via the *test_proj_filename* option. -``` -# Batch size must always be set to 1 for this scenario! We are going through every file seperately. -# In an IDE this could instead be sped up through some engineering. -python code_nlm.py --dynamic_test True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size 1 --word_level_perplexity True --cross_entropy True --test_proj_filename $TEST_PROJ_NAMES_FILE --num_steps $STEPS -``` +**Allocation exceeds free system memory** -### Test Code Completion -In this scenario the *batch_size* option is used to set the beam size. ``` -python code_nlm.py --completion True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE +[...] Allocation of 234393600 exceeds 10% of free system memory. +[...] Allocation of 234393600 exceeds 10% of free system memory. +[...] Allocation of 234393600 exceeds 10% of free system memory. +[...] ``` +This error is likely related to the Batchsize. It can occur in or outside of docker. +**Try reducing the batchsize**. +For older graphics cards try ~64, for non-gpu try batch sizes from 16 upward. -#### Dynamic Code Completion on Test Data -Similarly to before we need to set the *test_proj_filename* option. -``` -python code_nlm.py --completion True --dynamic True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --test_proj_filename $TEST_PROJ_NAMES_FILE --num_steps $STEPS -``` +### Warnings in/with Docker -#### Dynamic Code Completion on Test Data and Measuring Identifier Specific Performance -To run this experiment you need to provide a file containing a mapping that lets the implementation know for each subtoken whether it is part of an identifier or not. -This information would easily be present in an IDE. -The mapping is provided via the *identifier_map* option. 
-``` -python code_nlm.py --completion True --dynamic True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --test_proj_filename $TEST_PROJ_NAMES_FILE --identifier_map $ID_MAP_FILE --num_steps $STEPS -``` +**FileNotFoundError** -#### Adding a Simple Identifier n-gram Cache -In an IDE setting we could improve the performance on identifiers by utilizing a simple n-gram cache for identifiers that we have already encountered. -The *file_cache_weight* and *cache_order* options can be used to control the cache's weight and the cache's order respectively. -By default we use a 6-gram with a weight of 0.2. ``` -python code_nlm.py --completion True --dynamic True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --test_proj_filename $TEST_PROJ_NAMES_FILE --identifier_map $ID_MAP_FILE --cache_ids True --num_steps $STEPS +openvocabcodenlm-experiment-1 | FileNotFoundError: [Errno 2] No such file or directory: '/data/java/java_test_slp_pre_enc_bpe_10000' ``` -### Predictability -Similar to testing but calculates the average entropy of the files instead of the per token one. - - - -# Preprocessing - -## BPE -The BPE implementation used can be found here: https://github.com/rsennrich/subword-nmt - -To apply byte pair encoding to word segmentation, invoke these commands: -``` -subword-nmt learn-bpe -s {num_operations} < {train_file} > {codes_file} -subword-nmt apply-bpe -c {codes_file} < {test_file} > {out_file} -``` -num_operations = The number of BPE ops e.g., 10000
-train_file = The file on which to learn the encoding
-codes_file = The file in which to output the learned encoding
-test_file = The file to segment with the learned encoding
-out_file = The file in which to save the now segmented test_file
+This is likely happening because there was a missmatch in mounting the volumes. +The way volume-paths are defined (e.g. ending with "/" *unrolls* the directory-elements into the mounted volume) in the docker-compose +must match the behavior of the python script. +**Solution:** First, add the `tail -f /dev/null` to the end of the `entrypoint.sh`. +Run the compose, and find your container with `docker ps`. Enter your docker container with `docker exec -it {ID} bash`. +Inspect the `/data` folder and see whether it matches your expectations, and adjust the values in the compose accordingly. diff --git a/code_nlm.py b/code_nlm.py index 37e5f17..08693e8 100644 --- a/code_nlm.py +++ b/code_nlm.py @@ -1,5 +1,5 @@ # some structure based on https://github.com/wpm/tfrnnlm/blob/master/tfrnnlm/rnn.py -#https://github.com/tensorflow/tensorflow/pull/2580/files#diff-083dd112b4600ecbaf63b2070951aad8 +# https://github.com/tensorflow/tensorflow/pull/2580/files#diff-083dd112b4600ecbaf63b2070951aad8 from __future__ import print_function @@ -24,6 +24,7 @@ import numpy as np import tensorflow as tf +import tensorflow_addons as tfa import reader # BPE imports @@ -31,8 +32,7 @@ # from subword_nmt.apply_bpe import BPE, read_vocabulary - -flags = tf.flags +flags = tf.compat.v1.flags # Path options flags.DEFINE_string("data_path", None, "Path to folder containing training/test data.") flags.DEFINE_string("train_dir", None, "Output directory for saving the model.") @@ -40,8 +40,10 @@ # Scenario options. Training is default so, no option for it. flags.DEFINE_boolean("predict", False, "Set to True for computing predictability.") flags.DEFINE_boolean("test", False, "Set to True for computing test perplexity.") -flags.DEFINE_boolean("dynamic_test", False, "Set to True for performing dynamic train-testing perplexity calculation (only one train epoch).") -flags.DEFINE_boolean("maintenance_test", False, "Set to True for performing maintenance train-testing perplexity simulation (only one train epoch).") +flags.DEFINE_boolean("dynamic_test", False, + "Set to True for performing dynamic train-testing perplexity calculation (only one train epoch).") +flags.DEFINE_boolean("maintenance_test", False, + "Set to True for performing maintenance train-testing perplexity simulation (only one train epoch).") flags.DEFINE_boolean("completion", False, "Set to True to run code completion experiment.") flags.DEFINE_boolean("maintenance_completion", False, "Set to True to run maintenance code completion experiment") flags.DEFINE_boolean("dynamic", False, "Set to True to run dynamic code completion experiment.") @@ -54,7 +56,7 @@ flags.DEFINE_string("identifier_map", None, "The file that contains information about which tokens are identifiers.") flags.DEFINE_boolean("cache_ids", False, "Set to True to cache project identifiers during completion.") # flags.DEFINE_string("BPE", None, "The file containing the BPE encoding.") -flags.DEFINE_string("subtoken_map", None, "Contains the mapping from heyristic subtokens to tokens.") +flags.DEFINE_string("subtoken_map", None, "Contains the mapping from heuristic subtokens to tokens.") # flags.DEFINE_string("output_probs_file", "predictionProbabilities.txt", "The file to store output probabilities.") @@ -64,7 +66,8 @@ flags.DEFINE_float("keep_prob", 0.5, "Keep probability = 1.0 - dropout probability.") flags.DEFINE_integer("vocab_size", 25000, "Vocabulary size") flags.DEFINE_boolean("gru", False, "Use a GRU cell. 
Must be set to True to use a GRU, otherwise an LSTM will be used.") -flags.DEFINE_integer("steps_per_checkpoint", 5000, "Number of steps for printing stats (validation is run) and checkpointing the model. Must be increased by 'a lot' for large training corpora.") +flags.DEFINE_integer("steps_per_checkpoint", 5000, + "Number of steps for printing stats (validation is run) and checkpointing the model. Must be increased by 'a lot' for large training corpora.") flags.DEFINE_integer("max_epoch", 30, "Max number training epochs to run.") flags.DEFINE_integer("batch_size", 32, "Batch size") flags.DEFINE_integer("test_batch_size", 10, "Batch size during predictability test") @@ -88,171 +91,214 @@ "True for Allamanis et al. heuristic subtoken model.") flags.DEFINE_boolean("verbose", False, "Verbose for completion.") - FLAGS = flags.FLAGS + def data_type(): - """ + """ Returns the TF floating point type used for operations. :return: The data type used (tf.float32) """ - return tf.float32 + return tf.float32 + def get_gpu_config(): - gconfig = tf.ConfigProto() - gconfig.gpu_options.per_process_gpu_memory_fraction = 0.975 # Don't take 100% of the memory - gconfig.allow_soft_placement = True # Does not aggressively take all the GPU memory - gconfig.gpu_options.allow_growth = True # Take more memory when necessary - return gconfig + gconfig = tf.compat.v1.ConfigProto() + gconfig.gpu_options.per_process_gpu_memory_fraction = 0.975 # Don't take 100% of the memory + gconfig.allow_soft_placement = True # Does not aggressively take all the GPU memory + gconfig.gpu_options.allow_growth = True # Take more memory when necessary + return gconfig + class NLM(object): - def __init__(self, config): - """ + def __init__(self, config): + """ Initializes the neural language model based on the specified configation. + + The migration from tf 1.X to 2.X changed the behavior of the sequence to sequence loss-function. + On the one hand it changed position, from being in within tensorflow to being part of tensorflow-addons. + On the other hand, the default behavior changed to average accross batches - a behaviour that the initial authors + already had covered. This resulted in a loss that was "way to small" and a model not really learning. + This could be fixed by disabling the default behaviour of the loss function. + Another problem was the re-shaping of elements, as the 1.X behaviour could + work around switching vector-sizes by using "None" in the right places of shape. + This is not applicable anymore for 2.X (atleast in the same way) and has hence been hardcoded to + match the specified parameters. + + In general, if something within this file breaks / looks bad, I would first suspect the loss function. + :param config: The configuration to be used for initialization. """ - self.num_layers = config.num_layers - self.batch_size = batch_size = config.batch_size - self.num_steps = num_steps = config.num_steps - self.hidden_size = hidden_size = config.hidden_size - self.vocab_size = vocab_size = config.vocab_size - #self.predictions_file = config.output_probs_file - self.global_step = tf.Variable(0, trainable=False) - - with tf.name_scope("Parameters"): - # Sets dropout and learning rate. 
- self.learning_rate = tf.placeholder(tf.float32, name="learning_rate") - self.keep_probability = tf.placeholder(tf.float32, name="keep_probability") - - with tf.name_scope("Input"): - self.inputd = tf.placeholder(tf.int64, shape=(batch_size, None), name="inputd") - self.targets = tf.placeholder(tf.int64, shape=(batch_size, None), name="targets") - self.target_weights = tf.placeholder(tf.float32, shape=(batch_size, None), name="tgtweights") - - with tf.device("/cpu:0"): - with tf.name_scope("Embedding"): - # Initialize embeddings on the CPU and add dropout layer after embeddings. - self.embedding = tf.Variable(tf.random_uniform((vocab_size, hidden_size), -config.init_scale, config.init_scale), dtype=data_type(), name="embedding") - self.embedded_inputds = tf.nn.embedding_lookup(self.embedding, self.inputd, name="embedded_inputds") - self.embedded_inputds = tf.nn.dropout(self.embedded_inputds, self.keep_probability) - - with tf.name_scope("RNN"): - # Definitions for the different cells that can be used. Either lstm or GRU which will be wrapped with dropout. - def lstm_cell(): - if 'reuse' in inspect.getargspec(tf.contrib.rnn.BasicLSTMCell.__init__).args: - return tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0, state_is_tuple=True, reuse=tf.get_variable_scope().reuse) - else: - return tf.contrib.rnn.BasicLSTMCell(hidden_size, forget_bias=0.0, state_is_tuple=True) - def gru_cell(): - if 'reuse' in inspect.getargspec(tf.contrib.rnn.GRUCell.__init__).args: - return tf.contrib.rnn.GRUCell(hidden_size, reuse=tf.get_variable_scope().reuse) - else: - return tf.contrib.rnn.GRUCell(hidden_size) - def drop_cell(): - if FLAGS.gru: - return tf.contrib.rnn.DropoutWrapper(gru_cell(), output_keep_prob=self.keep_probability) - else: - return tf.contrib.rnn.DropoutWrapper(lstm_cell(), output_keep_prob=self.keep_probability) - - # Allows multiple layers to be used. Not advised though. - rnn_layers = tf.contrib.rnn.MultiRNNCell([drop_cell() for _ in range(self.num_layers)], state_is_tuple=True) - # Initialize the state to zero. 
- self.reset_state = rnn_layers.zero_state(batch_size, data_type()) - self.outputs, self.next_state = tf.nn.dynamic_rnn(rnn_layers, self.embedded_inputds, time_major=False, - initial_state=self.reset_state) - - with tf.name_scope("Cost"): - # Output and loss function calculation - self.output = tf.reshape(tf.concat(axis=0, values=self.outputs), [-1, hidden_size]) - self.softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size], dtype=data_type()) - self.softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) - self.logits = tf.matmul(self.output, self.softmax_w) + self.softmax_b - self.loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( - [self.logits], [tf.reshape(self.targets, [-1])], [tf.reshape(self.target_weights, [-1])]) - self.cost = tf.div(tf.reduce_sum(self.loss), batch_size, name="cost") - self.final_state = self.next_state - - self.norm_logits = tf.nn.softmax(self.logits) - - with tf.name_scope("Train"): - self.iteration = tf.Variable(0, dtype=data_type(), name="iteration", trainable=False) - tvars = tf.trainable_variables() - self.gradients, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), - config.max_grad_norm, name="clip_gradients") - optimizer = tf.train.GradientDescentOptimizer(self.learning_rate) - self.train_step = optimizer.apply_gradients(zip(self.gradients, tvars), name="train_step", - global_step=self.global_step) - self.validation_perplexity = tf.Variable(dtype=data_type(), initial_value=float("inf"), - trainable=False, name="validation_perplexity") - tf.summary.scalar(self.validation_perplexity.op.name, self.validation_perplexity) - self.training_epoch_perplexity = tf.Variable(dtype=data_type(), initial_value=float("inf"), - trainable=False, name="training_epoch_perplexity") - tf.summary.scalar(self.training_epoch_perplexity.op.name, self.training_epoch_perplexity) - - self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=None) - self.initialize = tf.initialize_all_variables() - self.summary = tf.summary.merge_all() - - - def get_parameter_count(self, debug=False): - """ + self.num_layers = config.num_layers + self.batch_size = batch_size = config.batch_size + self.num_steps = num_steps = config.num_steps + self.hidden_size = hidden_size = config.hidden_size + self.vocab_size = vocab_size = config.vocab_size + # self.predictions_file = config.output_probs_file + self.global_step = tf.Variable(0, trainable=False) + + with tf.compat.v1.name_scope("Parameters"): + # Sets dropout and learning rate. + self.learning_rate = tf.compat.v1.placeholder(tf.float32, name="learning_rate") + self.keep_probability = tf.compat.v1.placeholder(tf.float32, name="keep_probability") + + with tf.compat.v1.name_scope("Input"): + self.inputd = tf.compat.v1.placeholder(tf.int64, shape=(batch_size, None), name="inputd") + self.targets = tf.compat.v1.placeholder(tf.int64, shape=(batch_size, None), name="targets") + self.target_weights = tf.compat.v1.placeholder(tf.float32, shape=(batch_size, None), name="tgtweights") + + with tf.device("/cpu:0"): + with tf.compat.v1.name_scope("Embedding"): + # Initialize embeddings on the CPU and add dropout layer after embeddings. 
+ self.embedding = tf.Variable( + tf.random.uniform((vocab_size, hidden_size), -config.init_scale, config.init_scale), + dtype=data_type(), name="embedding") + self.embedded_inputds = tf.nn.embedding_lookup(params=self.embedding, ids=self.inputd, + name="embedded_inputds") + self.embedded_inputds = tf.nn.dropout(self.embedded_inputds, rate=1 - (self.keep_probability)) + + with tf.compat.v1.name_scope("RNN"): + # Definitions for the different cells that can be used. Either lstm or GRU which will be wrapped with dropout. + def lstm_cell(): + if 'reuse' in inspect.getargspec(tf.compat.v1.nn.rnn_cell.BasicLSTMCell.__init__).args: + return tf.compat.v1.nn.rnn_cell.BasicLSTMCell(hidden_size, forget_bias=0.0, state_is_tuple=True, + reuse=tf.compat.v1.get_variable_scope().reuse) + else: + return tf.compat.v1.nn.rnn_cell.BasicLSTMCell(hidden_size, forget_bias=0.0, state_is_tuple=True) + + def gru_cell(): + if 'reuse' in inspect.getargspec(tf.compat.v1.nn.rnn_cell.GRUCell.__init__).args: + return tf.compat.v1.nn.rnn_cell.GRUCell(hidden_size, reuse=tf.compat.v1.get_variable_scope().reuse) + else: + return tf.compat.v1.nn.rnn_cell.GRUCell(hidden_size) + + def drop_cell(): + if FLAGS.gru: + return tf.compat.v1.nn.rnn_cell.DropoutWrapper(gru_cell(), output_keep_prob=self.keep_probability) + # return tf.contrib.rnn.DropoutWrapper(gru_cell(), output_keep_prob=self.keep_probability) + else: + return tf.compat.v1.nn.rnn_cell.DropoutWrapper(lstm_cell(), output_keep_prob=self.keep_probability) + # return tf.contrib.rnn.DropoutWrapper(lstm_cell(), output_keep_prob=self.keep_probability) + + # Allows multiple layers to be used. Not advised though. + rnn_layers = tf.compat.v1.nn.rnn_cell.MultiRNNCell([drop_cell() for _ in range(self.num_layers)], + state_is_tuple=True) + # Initialize the state to zero. + self.reset_state = rnn_layers.zero_state(batch_size, data_type()) + self.outputs, self.next_state = tf.compat.v1.nn.dynamic_rnn(rnn_layers, self.embedded_inputds, + time_major=False, + initial_state=self.reset_state) + + with tf.compat.v1.name_scope("Cost"): + # Output and loss function calculation + self.output = tf.reshape(tf.concat(axis=0, values=self.outputs), [-1, hidden_size]) + self.softmax_w = tf.compat.v1.get_variable("softmax_w", [hidden_size, vocab_size], dtype=data_type()) + self.softmax_b = tf.compat.v1.get_variable("softmax_b", [vocab_size], dtype=data_type()) + self.logits = tf.matmul(self.output, self.softmax_w) + self.softmax_b + + # See here : https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/sequence_loss + # Old Interface: https://docs.w3cub.com/tensorflow~python/tf/contrib/legacy_seq2seq/sequence_loss_by_example + + big_shape = [self.batch_size, self.num_steps, self.vocab_size] + small_shape = [self.batch_size, self.num_steps] + # This cost function is used for both test and validation tasks, + # The perplexity measure and the completion measure. 
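+ # Added note (an assumption about the completion flow, not taken from the original code): the
+ # completion/beam-search scenario appears to feed the graph one timestep per run, which is why the
+ # loss tensors are reshaped to a fixed sequence length of 1 below, while training and testing keep
+ # the full [batch_size, num_steps] unroll.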
+ if FLAGS.completion: + big_shape = [self.batch_size, 1, self.vocab_size] + small_shape = [self.batch_size, 1] + + self.loss = tfa.seq2seq.loss.sequence_loss( + logits=tf.reshape(self.logits, big_shape), + targets=tf.reshape(self.targets, small_shape), + weights=tf.reshape(self.target_weights, small_shape), + average_across_batch=False, + sum_over_batch=False, + sum_over_timesteps=False, + average_across_timesteps=False + ) + + self.cost = tf.math.divide(tf.reduce_sum(input_tensor=self.loss), batch_size, name="cost") + self.final_state = self.next_state + + self.norm_logits = tf.nn.softmax(self.logits) + + with tf.compat.v1.name_scope("Train"): + self.iteration = tf.Variable(0, dtype=data_type(), name="iteration", trainable=False) + tvars = tf.compat.v1.trainable_variables() + self.gradients, _ = tf.clip_by_global_norm(tf.gradients(ys=self.cost, xs=tvars), + config.max_grad_norm, name="clip_gradients") + optimizer = tf.compat.v1.train.GradientDescentOptimizer(self.learning_rate) + self.train_step = optimizer.apply_gradients(zip(self.gradients, tvars), name="train_step", + global_step=self.global_step) + self.validation_perplexity = tf.Variable(dtype=data_type(), initial_value=float("inf"), + trainable=False, name="validation_perplexity") + tf.compat.v1.summary.scalar(self.validation_perplexity.op.name, self.validation_perplexity) + self.training_epoch_perplexity = tf.Variable(dtype=data_type(), initial_value=float("inf"), + trainable=False, name="training_epoch_perplexity") + tf.compat.v1.summary.scalar(self.training_epoch_perplexity.op.name, self.training_epoch_perplexity) + + self.saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables(), max_to_keep=None) + self.initialize = tf.compat.v1.initialize_all_variables() + self.summary = tf.compat.v1.summary.merge_all() + + def get_parameter_count(self, debug=False): + """ Counts the number of parameters required by the model. :param debug: Whether debugging information should be printed. :return: Returns the number of parameters required for the model. 
""" - params = tf.trainable_variables() - total_parameters = 0 - for variable in params: - shape = variable.get_shape() - variable_parameters = 1 - for dim in shape: - variable_parameters *= dim.value - if debug: - print(variable) - print(shape + "\t" + str(len(shape)) + "\t" + str(variable_parameters)) - total_parameters += variable_parameters - return total_parameters - - @property - def reset_state(self): - return self._reset_state - - @reset_state.setter - def reset_state(self, x): - self._reset_state = x - - @property - def cost(self): - return self._cost - - @cost.setter - def cost(self, y): - self._cost = y - - @property - def final_state(self): - return self._final_state - - @final_state.setter - def final_state(self, z): - self._final_state = z - - @property - def learning_rate(self): - return self._lr - - @learning_rate.setter - def learning_rate(self, l): - self._lr = l - - @property - def input(self): - return self.data - - - def train(self, session, config, train_data, exit_criteria, valid_data, summary_dir): - """ + params = tf.compat.v1.trainable_variables() + total_parameters = 0 + for variable in params: + shape = variable.get_shape() + variable_parameters = 1 + for dim in shape: + variable_parameters *= dim + if debug: + print(variable) + print(shape + "\t" + str(len(shape)) + "\t" + str(variable_parameters)) + total_parameters += variable_parameters + return total_parameters + + @property + def reset_state(self): + return self._reset_state + + @reset_state.setter + def reset_state(self, x): + self._reset_state = x + + @property + def cost(self): + return self._cost + + @cost.setter + def cost(self, y): + self._cost = y + + @property + def final_state(self): + return self._final_state + + @final_state.setter + def final_state(self, z): + self._final_state = z + + @property + def learning_rate(self): + return self._lr + + @learning_rate.setter + def learning_rate(self, l): + self._lr = l + + @property + def input(self): + return self.data + + def train(self, session, config, train_data, exit_criteria, valid_data, summary_dir): + """ Trains the NLM with the specified configuration, training, and validation data. Training is terminated when the specified criteria have been satisfied. :param session: The TF session in which operations should be run. @@ -262,150 +308,169 @@ def train(self, session, config, train_data, exit_criteria, valid_data, summary_ :param valid_data: The dataset instance to use for validation. :param summary_dir: Directory in which summary information will be stored. """ - summary_writer = tf.summary.FileWriter(summary_dir, session.graph) - previous_valid_log_ppx = [] - nglobal_steps = 0 - epoch = 1 - new_learning_rate = config.learning_rate - state = session.run(self.reset_state) - - try: - while True: - epoch_log_perp_unnorm = epoch_total_weights = 0.0 - print("Epoch %d Learning rate %0.3f" % (epoch, new_learning_rate)) - epoch_start_time = time.time() - # Runs each training step. A step is processing a minibatch of context-target pairs. - for step, (context, target, target_weights) in enumerate( - train_data.batch_producer_memory_efficient(self.batch_size, self.num_steps)): - # Every steps_per_checkpoint steps run validation and print perplexity/entropy. 
- if step % FLAGS.steps_per_checkpoint == 0: - print('Train steps:', step) - if step >0: - validation_perplexity = self.test(session, config, valid_data) - validation_log_perplexity = math.log(validation_perplexity) - print("global_steps %d learning_rate %.4f valid_perplexity %.2f" % (nglobal_steps, new_learning_rate, validation_perplexity)) - sys.stdout.flush() - feed_dict = {self.inputd: context, - self.targets: target, - self.target_weights: target_weights, - self.learning_rate: new_learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: # LSTM cell - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - # Run the actual training step. - _, cost, state, loss, iteration = session.run([self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - nglobal_steps += 1 - # Add step loss and weights to the total. - epoch_log_perp_unnorm += np.sum(loss) - epoch_total_weights += np.sum(sum(target_weights)) - # epoch_total_weights += np.sum(sum(sub_target_weights)) - train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights - train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") - - validation_perplexity = self.test(session, config, valid_data) - validation_log_perplexity = math.log(validation_perplexity) - # Checkpoint and save the model. - checkpoint_path = os.path.join(FLAGS.train_dir, "lm.ckpt.epoch" + str(epoch)) - self.saver.save(session, checkpoint_path, global_step=self.global_step) - - train_perplexity_summary = tf.Summary() - valid_perplexity_summary = tf.Summary() - - train_perplexity_summary.value.add(tag="train_log_ppx", simple_value=train_log_perplexity) - train_perplexity_summary.value.add(tag="train_ppx", simple_value=train_perplexity) - summary_writer.add_summary(train_perplexity_summary, nglobal_steps) - valid_perplexity_summary.value.add(tag="valid_log_ppx", simple_value=validation_log_perplexity) - valid_perplexity_summary.value.add(tag="valid_ppx", simple_value=validation_perplexity) - summary_writer.add_summary(valid_perplexity_summary, nglobal_steps) - # Convert epoch time in minutes and print info on screen. 
- epoch_time = (time.time() - epoch_start_time) * 1.0 / 60 - print("END EPOCH %d global_steps %d learning_rate %.4f time(mins) %.4f train_perplexity %.2f valid_perplexity %.2f" % - (epoch, nglobal_steps, new_learning_rate, epoch_time, train_perplexity, validation_perplexity)) - sys.stdout.flush() - - if exit_criteria.max_epochs is not None and epoch > exit_criteria.max_epochs: - raise StopTrainingException() - - # Decrease learning rate if valid ppx does not decrease - if len(previous_valid_log_ppx) > 1 and validation_log_perplexity >= previous_valid_log_ppx[-1]: - new_learning_rate = new_learning_rate * config.lr_decay - - # If validation perplexity has not improved over the last 5 epochs, stop training - if new_learning_rate == 0.0 or (len(previous_valid_log_ppx) > 4 and validation_log_perplexity > max(previous_valid_log_ppx[-5:])): - raise StopTrainingException() - - previous_valid_log_ppx.append(validation_log_perplexity) - epoch += 1 - except (StopTrainingException, KeyboardInterrupt): - print("Finished training ........") - - def test(self, session, config, test_data, ignore_padding=False): - """ + summary_writer = tf.compat.v1.summary.FileWriter(summary_dir, session.graph) + previous_valid_log_ppx = [] + nglobal_steps = 0 + epoch = 1 + new_learning_rate = config.learning_rate + state = session.run(self.reset_state) + + try: + while True: + epoch_log_perp_unnorm = epoch_total_weights = 0.0 + print("Epoch %d Learning rate %0.3f" % (epoch, new_learning_rate)) + epoch_start_time = time.time() + # Runs each training step. A step is processing a minibatch of context-target pairs. + for step, (context, target, target_weights) in enumerate( + train_data.batch_producer_memory_efficient(self.batch_size, self.num_steps)): + # Every steps_per_checkpoint steps run validation and print perplexity/entropy. + if step % FLAGS.steps_per_checkpoint == 0: + print('Train steps:', step) + if step > 0: + tf.print(valid_data) + validation_perplexity = self.test(session, config, valid_data) + validation_log_perplexity = math.log(validation_perplexity) + print("global_steps %d learning_rate %.4f valid_perplexity %.2f" % ( + nglobal_steps, new_learning_rate, validation_perplexity)) + # sys.stdout.flush() + feed_dict = {self.inputd: context, + self.targets: target, + self.target_weights: target_weights, + self.learning_rate: new_learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: # LSTM cell + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + + # Run the actual training step. + _, cost, state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + nglobal_steps += 1 + # Add step loss and weights to the total. + epoch_log_perp_unnorm += np.sum(loss) + epoch_total_weights += np.sum(sum(target_weights)) + # epoch_total_weights += np.sum(sum(sub_target_weights)) + train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights + train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") + + validation_perplexity = self.test(session, config, valid_data) + validation_log_perplexity = math.log(validation_perplexity) + # Checkpoint and save the model. 
+ checkpoint_path = os.path.join(FLAGS.train_dir, "lm.ckpt.epoch" + str(epoch)) + self.saver.save(session, checkpoint_path, global_step=self.global_step) + + train_perplexity_summary = tf.compat.v1.Summary() + valid_perplexity_summary = tf.compat.v1.Summary() + + train_perplexity_summary.value.add(tag="train_log_ppx", simple_value=train_log_perplexity) + train_perplexity_summary.value.add(tag="train_ppx", simple_value=train_perplexity) + summary_writer.add_summary(train_perplexity_summary, nglobal_steps) + valid_perplexity_summary.value.add(tag="valid_log_ppx", simple_value=validation_log_perplexity) + valid_perplexity_summary.value.add(tag="valid_ppx", simple_value=validation_perplexity) + summary_writer.add_summary(valid_perplexity_summary, nglobal_steps) + # Convert epoch time in minutes and print info on screen. + epoch_time = (time.time() - epoch_start_time) * 1.0 / 60 + print( + "END EPOCH %d global_steps %d learning_rate %.4f time(mins) %.4f train_perplexity %.2f valid_perplexity %.2f" % + (epoch, nglobal_steps, new_learning_rate, epoch_time, train_perplexity, validation_perplexity)) + # sys.stdout.flush() + + if exit_criteria.max_epochs is not None and epoch > exit_criteria.max_epochs: + raise StopTrainingException() + + # Decrease learning rate if valid ppx does not decrease + if len(previous_valid_log_ppx) > 1 and validation_log_perplexity >= previous_valid_log_ppx[-1]: + new_learning_rate = new_learning_rate * config.lr_decay + + # If validation perplexity has not improved over the last 5 epochs, stop training + if new_learning_rate == 0.0 or (len(previous_valid_log_ppx) > 4 and validation_log_perplexity > max( + previous_valid_log_ppx[-5:])): + raise StopTrainingException() + + previous_valid_log_ppx.append(validation_log_perplexity) + epoch += 1 + except (StopTrainingException, KeyboardInterrupt): + print("Finished training ........") + + def test(self, session, config, test_data, ignore_padding=False): + """ Tests the NLM with the specified configuration and test data. + + The Test Method is also used for validation of perplexity - the completion is not tested here. :param session: The TF session in which operations should be run. :param config: The configuration to be used for the model. :param test_data: :param ignore_padding: :return: """ - log_perp_unnorm, total_size = 0.0, 0.0 - batch_number = -1 - state = session.run(self.reset_state) - - for step, (context, target, target_weights, sub_target_weights) in enumerate( - test_data.batch_producer(self.batch_size, self.num_steps, True)): - batch_number += 1 - feed_dict = { - self.inputd: context, - self.targets: target, - self.target_weights: target_weights, - self.keep_probability: 1.0 # No dropout should be used for the test! 
- } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - # norm_logits, loss, cost, state = session.run([self.norm_logits, self.loss, self.cost, self.next_state], feed_dict) - loss, cost, state = session.run([self.loss, self.cost, self.next_state], feed_dict) - - if FLAGS.token_model: - targets = [t for tar in target for t in tar] - voc_size = 10500000 - loss = [-math.log(1.0/voc_size, 2) if t == self.train_vocab["-UNK-"] else l - for l,t in zip(loss, targets) ] - - log_perp_unnorm += np.sum(loss) - - if FLAGS.word_level_perplexity: - total_size += np.sum(sum(sub_target_weights)) - else: - total_size += np.sum(sum(target_weights)) - - if ignore_padding: - paddings = 0 - for tok_loss, weight in zip(loss, chain.from_iterable(zip(*target_weights))): - if weight == 0: - log_perp_unnorm -= tok_loss - paddings += 1 - - total_size += 1e-12 - log_ppx = log_perp_unnorm / total_size - ppx = math.exp(float(log_ppx)) if log_ppx < 300 else float("inf") - if FLAGS.cross_entropy: - return log_ppx - return ppx - - def dynamic_train_test_file(self, test_lines, train_vocab, train_vocab_rev, test_projects, config, output_path, session): - """ + print("Starting test ... ") + log_perp_unnorm, total_size = 0.0, 0.0 + batch_number = -1 + state = session.run(self.reset_state) + + for step, (context, target, target_weights, sub_target_weights) in enumerate( + test_data.batch_producer(self.batch_size, self.num_steps, True)): + # print("Validation Batch Number",batch_number,"Batch Size ",self.batch_size) + batch_number += 1 + feed_dict = { + self.inputd: context, + self.targets: target, + self.target_weights: target_weights, + self.keep_probability: 1.0 # No dropout should be used for the test! + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + + loss, cost, state = session.run([self.loss, self.cost, self.next_state], feed_dict) + + if FLAGS.token_model: + targets = [t for tar in target for t in tar] + voc_size = 10500000 + loss = [-math.log(1.0 / voc_size, 2) if t == self.train_vocab["-UNK-"] else l + for l, t in zip(loss, targets)] + + log_perp_unnorm += np.sum(loss) + + if FLAGS.word_level_perplexity: + total_size += np.sum(sum(sub_target_weights)) + else: + total_size += np.sum(sum(target_weights)) + + if ignore_padding: + paddings = 0 + for tok_loss, weight in zip(loss, chain.from_iterable(zip(*target_weights))): + if weight == 0: + log_perp_unnorm -= tok_loss + paddings += 1 + + total_size += 1e-12 + + log_ppx = (log_perp_unnorm / total_size) + ppx = math.exp(float(log_ppx)) if log_ppx < 300 else float("inf") + + if FLAGS.verbose: + print("Finished Validation with") + print("Total Weight-Size in Test was ", total_size) + print("Total Error in Test was ", log_perp_unnorm) + print("Resulting in: \t", "log_ppx", log_ppx, "ppx", ppx) + + if FLAGS.cross_entropy: + return log_ppx + return ppx + + def dynamic_train_test_file(self, test_lines, train_vocab, train_vocab_rev, test_projects, config, output_path, + session): + """ Tests the NLM on the specified test dataset but also updates its parameters by training on each file after computing its per token entropy. The model is restored back to the global model after testing has been completed for a project. 
@@ -422,79 +487,80 @@ def dynamic_train_test_file(self, test_lines, train_vocab, train_vocab_rev, test :param session: The TF session in which operations should be run. :return: Average loss per file and not per token. """ - config.batch_size = 1 - ctr = 0 - nglobal_steps = 0 - state = None - new_learning_rate = config.learning_rate - losses = [] - lengths = [] - - last_test_project = None - for test_line, test_project in zip(test_lines, test_projects): - ctr += 1 - if test_project != last_test_project and last_test_project is not None: - # New test project so restore the model back to the global one. - ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): - self.saver.restore(session, ckpt.model_checkpoint_path) - last_test_project = test_project - - # Get the ids for this test instance and calculate entropy/perplexity. - test_line = test_line.replace("\n", (" %s" % "-eod-")) - ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in test_line.split(' ')] - test_dataset = reader.dataset(ids, train_vocab, train_vocab_rev) - test_loss = self.test(session, config, test_dataset, True) - if FLAGS.cross_entropy: - print('line cross_entropy:', test_loss) - else: - print('line perplexity:', test_loss) - sys.stdout.flush() - losses.append(test_loss) - - # Train. - state = session.run(self.reset_state) - try: - epoch_log_perp_unnorm = epoch_total_weights = 0.0 - epoch_sub_total_weights = 0.0 - # Train on each batch to adapt the model to the new information available. - for step, (context, target, target_weights, sub_target_weights) in enumerate( - test_dataset.batch_producer(self.batch_size, self.num_steps)): - feed_dict = {self.inputd: context, - self.targets: target, - self.target_weights: target_weights, - self.learning_rate: new_learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: # LSTM - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - _, cost, state, loss, iteration = session.run([self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - nglobal_steps += 1 - epoch_log_perp_unnorm += np.sum(loss) - epoch_total_weights += np.sum(sum(target_weights)) - epoch_sub_total_weights += np.sum(sum(sub_target_weights)) - train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights - train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") - print(train_perplexity) - lengths.append(int(round(epoch_sub_total_weights, 0))) - except (StopTrainingException, KeyboardInterrupt): - print("Finished training ........") - - total_len = float(sum(lengths)) - len_weights = [length / total_len for length in lengths] - if FLAGS.cross_entropy: - print('Per token entropy:', sum([perp * weight for perp, weight in zip(losses, len_weights)])) - else: - print('Per token perplexity:', sum([perp * weight for perp, weight in zip(losses, len_weights)])) - return sum(losses)/ctr + config.batch_size = 1 + ctr = 0 + nglobal_steps = 0 + state = None + new_learning_rate = config.learning_rate + losses = [] + lengths = [] + + last_test_project = None + for test_line, test_project in zip(test_lines, test_projects): + ctr += 1 + if test_project != last_test_project and last_test_project is not None: + # New test project so restore the model back to the global one. 
+ ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) + if ckpt and tf.compat.v1.train.checkpoint_exists(ckpt.model_checkpoint_path): + self.saver.restore(session, ckpt.model_checkpoint_path) + last_test_project = test_project + + # Get the ids for this test instance and calculate entropy/perplexity. + test_line = test_line.replace("\n", (" %s" % "-eod-")) + ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in test_line.split(' ')] + test_dataset = reader.dataset(ids, train_vocab, train_vocab_rev) + test_loss = self.test(session, config, test_dataset, True) + if FLAGS.cross_entropy: + print('line cross_entropy:', test_loss) + else: + print('line perplexity:', test_loss) + # sys.stdout.flush() + losses.append(test_loss) - def dynamic_train_test(self, test_lines, train_vocab, train_vocab_rev, test_projects, config, output_path, session): - """ + # Train. + state = session.run(self.reset_state) + try: + epoch_log_perp_unnorm = epoch_total_weights = 0.0 + epoch_sub_total_weights = 0.0 + # Train on each batch to adapt the model to the new information available. + for step, (context, target, target_weights, sub_target_weights) in enumerate( + test_dataset.batch_producer(self.batch_size, self.num_steps)): + feed_dict = {self.inputd: context, + self.targets: target, + self.target_weights: target_weights, + self.learning_rate: new_learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: # LSTM + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + _, cost, state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + nglobal_steps += 1 + epoch_log_perp_unnorm += np.sum(loss) + epoch_total_weights += np.sum(sum(target_weights)) + epoch_sub_total_weights += np.sum(sum(sub_target_weights)) + train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights + train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") + print(train_perplexity) + lengths.append(int(round(epoch_sub_total_weights, 0))) + except (StopTrainingException, KeyboardInterrupt): + print("Finished training ........") + + total_len = float(sum(lengths)) + len_weights = [length / total_len for length in lengths] + if FLAGS.cross_entropy: + print('Per token entropy:', sum([perp * weight for perp, weight in zip(losses, len_weights)])) + else: + print('Per token perplexity:', sum([perp * weight for perp, weight in zip(losses, len_weights)])) + return sum(losses) / ctr + + def dynamic_train_test(self, test_lines, train_vocab, train_vocab_rev, test_projects, config, output_path, session): + """ Tests the NLM on the specified test dataset but also updates its parameters by training on each batch after computing its per token entropy first. The model is restored back to the global model after testing has been completed for a project. @@ -511,119 +577,120 @@ def dynamic_train_test(self, test_lines, train_vocab, train_vocab_rev, test_proj :param session: :return: Average loss per file and not per token. 
""" - config.batch_size = 1 - ctr = 0 - nglobal_steps = 0 - state = None - new_learning_rate = config.learning_rate - losses = [] - lengths = [] - - last_test_project = None - for test_line, test_project in zip(test_lines, test_projects): - ctr += 1 - if ctr % 100 == 0: - print("\t %d lines" % ctr) - total_len = float(sum(lengths)) - len_weights = [length / total_len for length in lengths] - print('Current per token :', sum([perp * weight for perp, weight in zip(losses, len_weights)])) - print(sum(losses) / ctr) + config.batch_size = 1 + ctr = 0 + nglobal_steps = 0 + state = None + new_learning_rate = config.learning_rate + losses = [] + lengths = [] + + last_test_project = None + for test_line, test_project in zip(test_lines, test_projects): + ctr += 1 + if ctr % 100 == 0: + print("\t %d lines" % ctr) + total_len = float(sum(lengths)) + len_weights = [length / total_len for length in lengths] + print('Current per token :', sum([perp * weight for perp, weight in zip(losses, len_weights)])) + print(sum(losses) / ctr) + + file_log_perp_unnorm = file_total_weights = 0.0 + file_sub_total_weights = 0.0 + + if test_project != last_test_project and last_test_project is not None: + # New test project so restore the model + ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) + if ckpt and tf.compat.v1.train.checkpoint_exists(ckpt.model_checkpoint_path): + self.saver.restore(session, ckpt.model_checkpoint_path) + last_test_project = test_project + + test_line = test_line.replace("\n", (" %s" % "-eod-")) + ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in test_line.split(' ')] + test_dataset = reader.dataset(ids, train_vocab, train_vocab_rev) + + # Test, Train + state = session.run(self.reset_state) + try: + epoch_log_perp_unnorm = epoch_total_weights = 0.0 + epoch_sub_total_weights = 0.0 + for step, (context, target, target_weights, sub_target_weights) in enumerate( + test_dataset.batch_producer(self.batch_size, self.num_steps)): + feed_dict = {self.inputd: context, + self.targets: target, + self.target_weights: target_weights, + self.keep_probability: 1.0 + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: # LSTM + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + loss, cost, state = session.run([self.loss, self.cost, self.next_state], feed_dict) + if FLAGS.token_model: + targets = [t for tar in target for t in tar] + loss = [-math.log(1.0 / len(self.train_vocab), 2) if t == self.train_vocab["-UNK-"] else l + for l, t in zip(loss, targets)] + file_log_perp_unnorm += np.sum(loss) + file_total_weights += np.sum(sum(target_weights)) + file_sub_total_weights += np.sum(sum(sub_target_weights)) + if True: + for tok_loss, weight in zip(loss, chain.from_iterable(zip(*target_weights))): + if weight == 0: + file_log_perp_unnorm -= tok_loss + + feed_dict = {self.inputd: context, + self.targets: target, + self.target_weights: target_weights, + self.learning_rate: new_learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: # LSTM + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + _, cost, state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + nglobal_steps += 1 + epoch_log_perp_unnorm += np.sum(loss) + epoch_total_weights += np.sum(sum(target_weights)) + 
epoch_sub_total_weights += np.sum(sum(sub_target_weights)) + + train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights + train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") + print(train_perplexity) + lengths.append(int(round(epoch_sub_total_weights, 0))) + + if FLAGS.word_level_perplexity: + test_loss = file_log_perp_unnorm / file_sub_total_weights + else: + test_loss = file_log_perp_unnorm / file_total_weights - file_log_perp_unnorm = file_total_weights = 0.0 - file_sub_total_weights = 0.0 + if FLAGS.cross_entropy: + print('line cross_entropy:', test_loss) + else: + print('line perplexity:', test_loss) + # sys.stdout.flush() + losses.append(test_loss) - if test_project != last_test_project and last_test_project is not None: - # New test project so restore the model - ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): - self.saver.restore(session, ckpt.model_checkpoint_path) - last_test_project = test_project - - test_line = test_line.replace("\n", (" %s" % "-eod-")) - ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in test_line.split(' ')] - test_dataset = reader.dataset(ids, train_vocab, train_vocab_rev) - - # Test, Train - state = session.run(self.reset_state) - try: - epoch_log_perp_unnorm = epoch_total_weights = 0.0 - epoch_sub_total_weights = 0.0 - for step, (context, target, target_weights, sub_target_weights) in enumerate( - test_dataset.batch_producer(self.batch_size, self.num_steps)): - feed_dict = {self.inputd: context, - self.targets: target, - self.target_weights: target_weights, - self.keep_probability: 1.0 - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: # LSTM - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - loss, cost, state = session.run([self.loss, self.cost, self.next_state], feed_dict) - if FLAGS.token_model: - targets = [t for tar in target for t in tar] - loss = [-math.log(1.0/len(self.train_vocab), 2) if t == self.train_vocab["-UNK-"] else l - for l,t in zip(loss, targets) ] - file_log_perp_unnorm += np.sum(loss) - file_total_weights += np.sum(sum(target_weights)) - file_sub_total_weights += np.sum(sum(sub_target_weights)) - if True: - for tok_loss, weight in zip(loss, chain.from_iterable(zip(*target_weights))): - if weight == 0: - file_log_perp_unnorm -= tok_loss - - feed_dict = {self.inputd: context, - self.targets: target, - self.target_weights: target_weights, - self.learning_rate: new_learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: # LSTM - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - _, cost, state, loss, iteration = session.run([self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - nglobal_steps += 1 - epoch_log_perp_unnorm += np.sum(loss) - epoch_total_weights += np.sum(sum(target_weights)) - epoch_sub_total_weights += np.sum(sum(sub_target_weights)) - - train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights - train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") - print(train_perplexity) - lengths.append(int(round(epoch_sub_total_weights, 0))) - - if FLAGS.word_level_perplexity: - test_loss = file_log_perp_unnorm / file_sub_total_weights - 
else: - test_loss = file_log_perp_unnorm / file_total_weights + except (StopTrainingException, KeyboardInterrupt): + print("Finished training ........") + total_len = float(sum(lengths)) + len_weights = [length / total_len for length in lengths] if FLAGS.cross_entropy: - print('line cross_entropy:', test_loss) + print('Per token entropy:', sum([perp * weight for perp, weight in zip(losses, len_weights)])) else: - print('line perplexity:', test_loss) - sys.stdout.flush() - losses.append(test_loss) - - except (StopTrainingException, KeyboardInterrupt): - print("Finished training ........") - - total_len = float(sum(lengths)) - len_weights = [length / total_len for length in lengths] - if FLAGS.cross_entropy: - print('Per token entropy:', sum([perp * weight for perp, weight in zip(losses, len_weights)])) - else: - print('Per token perplexity:', sum([perp * weight for perp, weight in zip(losses, len_weights)])) - return sum(losses)/ctr + print('Per token perplexity:', sum([perp * weight for perp, weight in zip(losses, len_weights)])) + return sum(losses) / ctr - def maintenance_test(self, session, config, test_lines, test_projects, train_vocab, train_vocab_rev): - """ + def maintenance_test(self, session, config, test_lines, test_projects, train_vocab, train_vocab_rev): + """ Simulates code maintenance scenario. For each file in a project the model is first adapted on the rest of the files. The model is also adapted on encountered sequences of the test file. @@ -637,300 +704,305 @@ def maintenance_test(self, session, config, test_lines, test_projects, train_voc :param train_vocab_rev: The id to word mapping. :return: """ - # If checkpoint does not exist throw an exception. A global model must have been pretrained. - ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if not tf.train.checkpoint_exists(ckpt.model_checkpoint_path): - raise Exception('Checkpoint does not exist!') - - # Some initializations. - new_learning_rate = config.learning_rate - test_losses = [] - test_losses_sum = 0.0 - lengths = [] - ctr = 0 - - # First compute which files belong in each project. - project_sizes = [] - last_project_name = '' - project_file_size = 0 - for test_project in test_projects: - if test_project != last_project_name: - if last_project_name != '': - project_sizes.append(project_file_size) - project_file_size = 0 - else: - project_file_size += 1 - last_project_name = test_project - print(project_sizes) - print(sum(project_sizes)) - print() - - # Now distribute test lines based on project size. - project_test_lines = [] - files_distributed = 0 - for project_file_size in project_sizes: - project_test_lines.append(test_lines[files_distributed : files_distributed + project_file_size]) - files_distributed += project_file_size - - partitions = 20 # Default number of partitions - for proj_id, test_lines in enumerate(project_test_lines): - print(len(test_lines)) - large_project = len(test_lines) > 200 - if len(test_lines) > 2000: # Really big projects can have speed benefits from more partitions. - partitions = 50 - else: - partitions = 20 - - if large_project: - # The project is very large so partition it in partition_size parts. - # For each of the 20/50 parts train a model on all the parts excluding itself. 
- partition_size = len(test_lines) / partitions - for partition in range(partitions): - partition_words = [] - for i, line in enumerate(test_lines): - if i / partition_size != partition and not (i / partition_size == partitions + 1 and partition == partitions): - partition_words.extend([word for word in line.split(' ')]) - # If checkpoint does not exist throw an exception. A global model should have been trained... - ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if not tf.train.checkpoint_exists(ckpt.model_checkpoint_path): - raise Exception('Checkpoint for global model does not exist!') - self.saver.restore(session, ckpt.model_checkpoint_path) - - # Where to save this partition. - partition_path = os.path.join(FLAGS.train_dir, "partition%d" % partition) - if os.path.isdir(partition_path): shutil.rmtree(partition_path) - - partition_ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in partition_words] - partition_dataset = reader.dataset(partition_ids, train_vocab, train_vocab_rev) - - # Reset the LSTM state to zeros and train - state = session.run(self.reset_state) - try: - epoch_log_perp_unnorm = epoch_total_weights = 0.0 - epoch_sub_total_weights = 0.0 - for step, (context, target, target_weights, sub_target_weights) in enumerate( - partition_dataset.batch_producer(self.batch_size, self.num_steps)): - feed_dict = {self.inputd: context, - self.targets: target, - self.target_weights: target_weights, - self.learning_rate: new_learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - _, cost, state, loss, iteration = session.run( - [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - epoch_log_perp_unnorm += np.sum(loss) - epoch_total_weights += np.sum(sum(target_weights)) - epoch_sub_total_weights += np.sum(sum(sub_target_weights)) - - train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights - train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") - print(train_perplexity) - checkpoint_path = os.path.join(partition_path, "model") - self.saver.save(session, checkpoint_path, global_step=self.global_step) - except (StopTrainingException, KeyboardInterrupt): - print("Finished training ........") - - for lines_done, test_line in enumerate(test_lines): - ctr += 1 - # Restore the global model + # If checkpoint does not exist throw an exception. A global model must have been pretrained. 
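+        # maintenance_test never trains from scratch; it always adapts a copy of this pretrained global model.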
ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): - self.saver.restore(session, ckpt.model_checkpoint_path) - - if large_project: - # Load pretrained model and only train on the rest of the files from this partition - partition = lines_done / partition_size - partition = min(partitions - 1, partition) # last partition can be bigger - partition_path = os.path.join(FLAGS.train_dir, "partition%d" % partition) - part_ckpt = tf.train.get_checkpoint_state(partition_path) - self.saver.restore(session, part_ckpt.model_checkpoint_path) - - train_words = [] - if partition < partitions - 1: - partition_lines = zip(range(partition * partition_size, (partition + 1) * partition_size), - test_lines[partition * partition_size : (partition + 1) * partition_size]) - else: - partition_lines = zip(range(partition * partition_size, len(test_lines)), - test_lines[partition * partition_size : (partition + 1) * partition_size]) - for i, line in partition_lines: - if i != (lines_done % partition_size): - train_words.extend([word for word in line.split(' ')]) - else: - # Now for each file in the current test project use the rest as train data - train_words = [] - for i, line in enumerate(test_lines): - if i != lines_done: - train_words.extend([word for word in line.split(' ')]) - - # Convert the train data words to ids - ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in train_words] - train_dataset = reader.dataset(ids, train_vocab, train_vocab_rev) + if not tf.compat.v1.train.checkpoint_exists(ckpt.model_checkpoint_path): + raise Exception('Checkpoint does not exist!') - # Reset the LSTM state to zeros and train - state = session.run(self.reset_state) - try: - epoch_log_perp_unnorm = epoch_total_weights = 0.0 - epoch_sub_total_weights = 0.0 - for step, (context, target, target_weights, sub_target_weights) in enumerate( - train_dataset.batch_producer(self.batch_size, self.num_steps)): - feed_dict = {self.inputd: context, - self.targets: target, - self.target_weights: target_weights, - self.learning_rate: new_learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] + # Some initializations. + new_learning_rate = config.learning_rate + test_losses = [] + test_losses_sum = 0.0 + lengths = [] + ctr = 0 + + # First compute which files belong in each project. + project_sizes = [] + last_project_name = '' + project_file_size = 0 + for test_project in test_projects: + if test_project != last_project_name: + if last_project_name != '': + project_sizes.append(project_file_size) + project_file_size = 0 else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - # print("state number " + str(i)) - _, cost, state, loss, iteration = session.run([self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - epoch_log_perp_unnorm += np.sum(loss) - epoch_total_weights += np.sum(sum(target_weights)) - epoch_sub_total_weights += np.sum(sum(sub_target_weights)) - train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights - train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") - # lengths.append(int(round(epoch_sub_total_weights, 0))) - - # Training done. 
Now test on test file - ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] - for word in test_lines[lines_done].split(' ')] - # Test on each sequence of tokens and then train on it until the file is done - subtokens_done = 0 - tokens_done = 0 - instance_losses = [] - # Reset the LSTM state to zeros and train - state = session.run(self.reset_state) - test_state = session.run(self.reset_state) - while subtokens_done + config.num_steps < len(ids): - step_end = subtokens_done + config.num_steps - unfinished_token = train_vocab_rev[ids[step_end]].endswith('@@') - while unfinished_token: - step_end += 1 - unfinished_token = train_vocab_rev[ids[step_end]].endswith('@@') - - try: - context = ids[subtokens_done : step_end] - target = ids[subtokens_done + 1 : step_end + 1] - target_weights = [1] * len(context) - for id in context: - if not train_vocab_rev[id].endswith('@@'): - tokens_done += 1 - - feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), - self.targets: np.tile(target, (self.batch_size, 1)), - self.target_weights: np.tile(target_weights, (self.batch_size, 1)), - self.keep_probability: 1.0 - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = test_state[i] - else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = test_state[i].c - feed_dict[h] = test_state[i].h - cost, test_state, loss = session.run([self.cost, self.next_state, self.loss], feed_dict) - if tokens_done > 0: - instance_losses.append((tokens_done, len(context), sum(loss)/tokens_done)) - feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), - self.targets: np.tile(target, (self.batch_size, 1)), - self.target_weights: np.tile(target_weights, (self.batch_size, 1)), - self.learning_rate: new_learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: # LSTM - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - _, cost, state, loss, iteration = session.run( - [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - except (StopTrainingException, KeyboardInterrupt): - print("Finished training ........") - subtokens_done = step_end - - # Test and train on leftover part of the sequence, which has length < step_size. - try: - tokens_done = 0 - context = ids[subtokens_done:-1] - target = ids[subtokens_done + 1:] - target_weights = [1] * len(context) - if len(context) == 0 or len(target) == 0: # if there are no actual leftovers stop. - continue - - for id in context: - if not train_vocab_rev[id].endswith('@@'): - tokens_done += 1 - feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), - self.targets: np.tile(target, (self.batch_size, 1)), - self.target_weights: np.tile(target_weights, (self.batch_size, 1)), - self.keep_probability: 1.0 - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = test_state[i] + project_file_size += 1 + last_project_name = test_project + print(project_sizes) + print(sum(project_sizes)) + print() + + # Now distribute test lines based on project size. 
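+        # Each project gets a contiguous slice of test_lines, in the same order in which project_sizes was built.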
+ project_test_lines = [] + files_distributed = 0 + for project_file_size in project_sizes: + project_test_lines.append(test_lines[files_distributed: files_distributed + project_file_size]) + files_distributed += project_file_size + + partitions = 20 # Default number of partitions + for proj_id, test_lines in enumerate(project_test_lines): + print(len(test_lines)) + large_project = len(test_lines) > 200 + if len(test_lines) > 2000: # Really big projects can have speed benefits from more partitions. + partitions = 50 else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = test_state[i].c - feed_dict[h] = test_state[i].h - cost, test_state, loss = session.run([self.cost, self.next_state, self.loss], feed_dict) - if tokens_done > 0: - instance_losses.append((tokens_done, len(context), sum(loss)/tokens_done)) - feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), - self.targets: np.tile(target, (self.batch_size, 1)), - self.target_weights: np.tile(target_weights, (self.batch_size, 1)), - self.learning_rate: new_learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: # LSTM - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - # print("state number " + str(i)) - _, cost, state, loss, iteration = session.run( - [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - except (StopTrainingException, KeyboardInterrupt): - print("Finished training ........") - - tokens_done = float(sum([toks for toks, _, _ in instance_losses])) - lengths.append(tokens_done) - weighted_losses = [loss * toks / tokens_done for toks, _, loss in instance_losses] - test_loss = sum(weighted_losses) - test_losses_sum += test_loss - print('line cross_entropy:', test_loss, '--- current average:', test_losses_sum / ctr) - # if FLAGS.cross_entropy: - # print('line cross_entropy:', test_loss, '--- current average:', test_losses_sum / ctr) - # else: - # print('line perplexity:', test_loss) - sys.stdout.flush() - test_losses.append(test_loss) - except (StopTrainingException, KeyboardInterrupt): - print("Finished training ........") - - print("Projects Done:", proj_id + 1) + partitions = 20 + + if large_project: + # The project is very large so partition it in partition_size parts. + # For each of the 20/50 parts train a model on all the parts excluding itself. + partition_size = len(test_lines) / partitions + for partition in range(partitions): + partition_words = [] + for i, line in enumerate(test_lines): + if i / partition_size != partition and not ( + i / partition_size == partitions + 1 and partition == partitions): + partition_words.extend([word for word in line.split(' ')]) + # If checkpoint does not exist throw an exception. A global model should have been trained... + ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) + if not tf.compat.v1.train.checkpoint_exists(ckpt.model_checkpoint_path): + raise Exception('Checkpoint for global model does not exist!') + self.saver.restore(session, ckpt.model_checkpoint_path) + + # Where to save this partition. 
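+                    # A stale checkpoint left over from an earlier run of this partition is deleted before retraining.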
+ partition_path = os.path.join(FLAGS.train_dir, "partition%d" % partition) + if os.path.isdir(partition_path): shutil.rmtree(partition_path) + + partition_ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in + partition_words] + partition_dataset = reader.dataset(partition_ids, train_vocab, train_vocab_rev) + + # Reset the LSTM state to zeros and train + state = session.run(self.reset_state) + try: + epoch_log_perp_unnorm = epoch_total_weights = 0.0 + epoch_sub_total_weights = 0.0 + for step, (context, target, target_weights, sub_target_weights) in enumerate( + partition_dataset.batch_producer(self.batch_size, self.num_steps)): + feed_dict = {self.inputd: context, + self.targets: target, + self.target_weights: target_weights, + self.learning_rate: new_learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + _, cost, state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + epoch_log_perp_unnorm += np.sum(loss) + epoch_total_weights += np.sum(sum(target_weights)) + epoch_sub_total_weights += np.sum(sum(sub_target_weights)) + + train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights + train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float( + "inf") + print(train_perplexity) + checkpoint_path = os.path.join(partition_path, "model") + self.saver.save(session, checkpoint_path, global_step=self.global_step) + except (StopTrainingException, KeyboardInterrupt): + print("Finished training ........") + + for lines_done, test_line in enumerate(test_lines): + ctr += 1 + # Restore the global model + ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) + if ckpt and tf.compat.v1.train.checkpoint_exists(ckpt.model_checkpoint_path): + self.saver.restore(session, ckpt.model_checkpoint_path) + + if large_project: + # Load pretrained model and only train on the rest of the files from this partition + partition = lines_done / partition_size + partition = min(partitions - 1, partition) # last partition can be bigger + partition_path = os.path.join(FLAGS.train_dir, "partition%d" % partition) + part_ckpt = tf.train.get_checkpoint_state(partition_path) + self.saver.restore(session, part_ckpt.model_checkpoint_path) + + train_words = [] + if partition < partitions - 1: + partition_lines = zip(range(partition * partition_size, (partition + 1) * partition_size), + test_lines[partition * partition_size: (partition + 1) * partition_size]) + else: + partition_lines = zip(range(partition * partition_size, len(test_lines)), + test_lines[partition * partition_size: (partition + 1) * partition_size]) + for i, line in partition_lines: + if i != (lines_done % partition_size): + train_words.extend([word for word in line.split(' ')]) + else: + # Now for each file in the current test project use the rest as train data + train_words = [] + for i, line in enumerate(test_lines): + if i != lines_done: + train_words.extend([word for word in line.split(' ')]) + + # Convert the train data words to ids + ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in train_words] + train_dataset = reader.dataset(ids, train_vocab, train_vocab_rev) + + # Reset the LSTM state to zeros and train + state = session.run(self.reset_state) + try: + epoch_log_perp_unnorm = 
epoch_total_weights = 0.0 + epoch_sub_total_weights = 0.0 + for step, (context, target, target_weights, sub_target_weights) in enumerate( + train_dataset.batch_producer(self.batch_size, self.num_steps)): + feed_dict = {self.inputd: context, + self.targets: target, + self.target_weights: target_weights, + self.learning_rate: new_learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + # print("state number " + str(i)) + _, cost, state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + epoch_log_perp_unnorm += np.sum(loss) + epoch_total_weights += np.sum(sum(target_weights)) + epoch_sub_total_weights += np.sum(sum(sub_target_weights)) + train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights + train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") + # lengths.append(int(round(epoch_sub_total_weights, 0))) + + # Training done. Now test on test file + ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] + for word in test_lines[lines_done].split(' ')] + # Test on each sequence of tokens and then train on it until the file is done + subtokens_done = 0 + tokens_done = 0 + instance_losses = [] + # Reset the LSTM state to zeros and train + state = session.run(self.reset_state) + test_state = session.run(self.reset_state) + while subtokens_done + config.num_steps < len(ids): + step_end = subtokens_done + config.num_steps + unfinished_token = train_vocab_rev[ids[step_end]].endswith('@@') + while unfinished_token: + step_end += 1 + unfinished_token = train_vocab_rev[ids[step_end]].endswith('@@') + + try: + context = ids[subtokens_done: step_end] + target = ids[subtokens_done + 1: step_end + 1] + target_weights = [1] * len(context) + for id in context: + if not train_vocab_rev[id].endswith('@@'): + tokens_done += 1 + + feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), + self.targets: np.tile(target, (self.batch_size, 1)), + self.target_weights: np.tile(target_weights, (self.batch_size, 1)), + self.keep_probability: 1.0 + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = test_state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = test_state[i].c + feed_dict[h] = test_state[i].h + cost, test_state, loss = session.run([self.cost, self.next_state, self.loss], feed_dict) + if tokens_done > 0: + instance_losses.append((tokens_done, len(context), sum(loss) / tokens_done)) + feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), + self.targets: np.tile(target, (self.batch_size, 1)), + self.target_weights: np.tile(target_weights, (self.batch_size, 1)), + self.learning_rate: new_learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: # LSTM + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + _, cost, state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + except (StopTrainingException, KeyboardInterrupt): + print("Finished training ........") + subtokens_done = step_end + + # Test and train on leftover part of the sequence, which has length < step_size. 
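+                    # The remaining subtokens no longer fill a full num_steps window, so they are scored and trained on as one final shorter batch.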
+ try: + tokens_done = 0 + context = ids[subtokens_done:-1] + target = ids[subtokens_done + 1:] + target_weights = [1] * len(context) + if len(context) == 0 or len(target) == 0: # if there are no actual leftovers stop. + continue + + for id in context: + if not train_vocab_rev[id].endswith('@@'): + tokens_done += 1 + feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), + self.targets: np.tile(target, (self.batch_size, 1)), + self.target_weights: np.tile(target_weights, (self.batch_size, 1)), + self.keep_probability: 1.0 + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = test_state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = test_state[i].c + feed_dict[h] = test_state[i].h + cost, test_state, loss = session.run([self.cost, self.next_state, self.loss], feed_dict) + if tokens_done > 0: + instance_losses.append((tokens_done, len(context), sum(loss) / tokens_done)) + feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), + self.targets: np.tile(target, (self.batch_size, 1)), + self.target_weights: np.tile(target_weights, (self.batch_size, 1)), + self.learning_rate: new_learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: # LSTM + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + # print("state number " + str(i)) + _, cost, state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + except (StopTrainingException, KeyboardInterrupt): + print("Finished training ........") + + tokens_done = float(sum([toks for toks, _, _ in instance_losses])) + lengths.append(tokens_done) + weighted_losses = [loss * toks / tokens_done for toks, _, loss in instance_losses] + test_loss = sum(weighted_losses) + test_losses_sum += test_loss + print('line cross_entropy:', test_loss, '--- current average:', test_losses_sum / ctr) + # if FLAGS.cross_entropy: + # print('line cross_entropy:', test_loss, '--- current average:', test_losses_sum / ctr) + # else: + # print('line perplexity:', test_loss) + # sys.stdout.flush() + test_losses.append(test_loss) + except (StopTrainingException, KeyboardInterrupt): + print("Finished training ........") + + print("Projects Done:", proj_id + 1) - total_len = float(sum(lengths)) - len_weights = [length / total_len for length in lengths] - print('Is it correct perplexity:', sum([perp * weight for perp, weight in zip(test_losses, len_weights)])) - return test_losses_sum / ctr + total_len = float(sum(lengths)) + len_weights = [length / total_len for length in lengths] + print('Is it correct perplexity:', sum([perp * weight for perp, weight in zip(test_losses, len_weights)])) + return test_losses_sum / ctr - def completion(self, session, config, test_dataset, test_projects, beam_size, dynamic=False, id_map=None, cache_ids=False, token_map=None): - """ + def completion(self, session, config, test_dataset, test_projects, beam_size, dynamic=False, id_map=None, + cache_ids=False, token_map=None): + """ Runs code the code completion scenario. Dynamic update can be performed but by default is turned off. :param session: The TF session in which operations should be run. :param config: The configuration to be used for the model. @@ -940,553 +1012,570 @@ def completion(self, session, config, test_dataset, test_projects, beam_size, dy :param dynamic: Whether dynamic adaptation should be performed. 
:return: """ - mrr = 0.0 - id_mrr = 0.0 - id_acc1 = 0.0 - id_acc3 = 0.0 - id_acc5 = 0.0 - id_acc10 = 0.0 - - ids_in_cache = 0.0 - ids_in_project_cache = 0.0 - context_history_in_ngram_cache = 0.0 - context_history_in_ngram_project_cache = 0.0 - - satisfaction_prob = 0.8 - top_needed = 10 - verbose = FLAGS.verbose - last_test_project = None - train_every = config.num_steps - tokens_done = 0 - files_done = 0 - identifiers = 0 - file_identifiers = 0 - state = session.run(self.reset_state) - - context_size = FLAGS.cache_order - 1 - context_history = deque([None] * context_size, context_size) - ngram_cache = dict() - ngram_project_cache = dict() - # project_context_history = [] - id_cache = trie.CharTrie() - project_id_cache = trie.CharTrie() - CACHE_WEIGHT = FLAGS.file_cache_weight - PROJECT_CACHE_WEIGHT = FLAGS.file_cache_weight * 0.5 - SKIP_CACHE_PROB_THRESHOLD = 0.0 - - raw_data = test_dataset.data # is just one long array - data_len = len(raw_data) - print('Data Length:', data_len) - data_covered = 0 - end_file_id = test_dataset.vocab["-eod-"] - start_index = 0 - file_start_index = 0 - while data_covered < data_len: - # Stop when 1000000 test tokens have been scored. - if tokens_done > 1000000: - break - - # Create minibatches for the next file - while raw_data[data_covered] != end_file_id: - data_covered += 1 - data_covered += 1 # eod symbol - file_identifiers = 0 - - # Reset identifier cache for each file. - if cache_ids: - id_cache.clear() - ids_in_cache = 0.0 - ids_in_project_cache = 0.0 - context_history = deque([None] * context_size, context_size) - ngram_cache = dict() - context_history_in_ngram_cache = 0.0 - context_history_in_ngram_project_cache = 0.0 - - if dynamic: - test_project = test_projects[files_done] - if test_project != last_test_project: - # New test project so restore the model - ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): - self.saver.restore(session, ckpt.model_checkpoint_path) - - # Reset the project's cache of identifiers if one is used. 
- if cache_ids: - print('clearing project cache') - ids_in_project_cache = 0.0 - project_id_cache.clear() - ngram_project_cache = dict() - last_test_project = test_project - - file_data = raw_data[file_start_index:data_covered] - file_start_index = data_covered - print('Completion Length:', len(file_data)) - - if not id_map is None: file_ids = id_map[files_done] + [0] - else: file_ids = [0] * (len(file_data) - 1) - - if not token_map is None: file_tokens = token_map[files_done] - else: file_tokens = [0] * (len(file_data) - 1) - - tokens_before = deque([None, test_dataset.rev_vocab[file_data[0]]], 2) - - state = session.run(self.reset_state) - remember_state = state - train_state = session.run(self.reset_state) - in_token = False - - correct_token = '' - train_start = 0 - train_end = 0 - # to_add = [] - for subtoken_id, context_target_is_id in enumerate(zip(file_data[:-1], file_data[1:], file_ids)): - context, target, is_id = context_target_is_id - # print(test_dataset.rev_vocab[context], test_dataset.rev_vocab[target], is_id) - train_end += 1 - - # to_add.append(test_dataset.rev_vocab[context]) - if (subtoken_id - 1) - train_start > train_every and not test_dataset.rev_vocab[context].endswith('@@'): - # train - if dynamic: - tr_context, tr_target, tr_target_weights = file_data[train_start : subtoken_id - 1], \ - file_data[train_start + 1 : subtoken_id], \ - [1.0] * ((subtoken_id - 1) - train_start) - # start_train_at = subtoken_id - feed_dict = {self.inputd: np.tile(tr_context, (self.batch_size, 1)), - self.targets: np.tile(tr_target, (self.batch_size, 1)), - self.target_weights: np.tile(tr_target_weights, (self.batch_size, 1)), - self.learning_rate: config.learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = train_state[i] - else: # LSTM - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = train_state[i].c - feed_dict[h] = train_state[i].h - _, cost, train_state, loss, iteration = session.run( - [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - train_start = train_end - - feed_dict = {self.inputd: np.array([[context]] * self.batch_size), - self.targets: np.array([[target]] * self.batch_size), - self.target_weights: np.array([[1.0]] * self.batch_size), - self.keep_probability: 1.0 - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: # LSTM - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - norm_logits, loss, cost, state = session.run([self.norm_logits, self.loss, self.cost, self.next_state], feed_dict) - - correct_word = test_dataset.rev_vocab[target] - if verbose: print('Correct:', correct_word) - # if train_start > 0 and is_id: - # self._score_cache_contents(session, config, beam_size, test_dataset, list(norm_logits[0]), id_cache, context, state) - - if correct_word.endswith('@@') or (token_map is not None and not file_tokens[subtoken_id]): - if not in_token: - correct_subtokens = [] - remember_state = state - logits = norm_logits[0] - # correct_token = correct_word[:-2] - correct_token = correct_word - else: - # correct_token += correct_word[:-2] - correct_token += correct_word - correct_subtokens.append(correct_word) - in_token = True - continue - else: - tokens_done += 1 - if not id_map is None and is_id: - identifiers += 1 - file_identifiers += 1 - if not in_token: - correct_subtokens = [] - remember_state = state - logits = norm_logits[0] - correct_token 
= correct_word - else: - correct_token += correct_word - in_token = False - correct_subtokens.append(correct_word) - - full_tokens_found = 0 - full_tokens = [] - - project_cache_preds = None - if is_id and id_cache.has_key(correct_token): - ids_in_cache += 1.0 - if is_id and project_id_cache.has_key(correct_token): - ids_in_project_cache += 1.0 - if is_id and tuple(context_history) in ngram_cache: - context_history_in_ngram_cache += 1 - # print('file context hit:', correct_token in ngram_cache[tuple(context_history)], len(ngram_cache[tuple(context_history)])) - file_cache_preds = self._score_cache_contents(session, config, beam_size, test_dataset, \ - list(norm_logits[0]), ngram_cache[tuple(context_history)], context, remember_state) - # print(file_cache_preds, correct_token.replace('@@', '')) - if is_id and tuple(context_history) in ngram_project_cache: - context_history_in_ngram_project_cache += 1 - # print('project context hit:', correct_token in ngram_project_cache[tuple(context_history)], len(ngram_project_cache[tuple(context_history)])) - project_cache_preds = self._score_cache_contents(session, config, beam_size, test_dataset, \ - list(norm_logits[0]), ngram_project_cache[tuple(context_history)], context, remember_state) - # print(project_cache_preds, correct_token.replace('@@', '')) - # print() - - # Rank single subtoken long predictions and keep top_needed (usually 10) best complete token ones - sorted = list(enumerate(logits)) - sorted.sort(key=itemgetter(1), reverse=True) - complete_done = 0 - prob_mass = 0.0 - counted = 0 - for id, prob in sorted: - if cache_ids and is_id and False: - word = test_dataset.rev_vocab[id] - counted += 1 - if not word.endswith('@@'): - if id_cache.has_key(word) or prob >= SKIP_CACHE_PROB_THRESHOLD: - complete_done += 1 - full_tokens.append((prob, word)) - prob_mass += prob - if complete_done >= top_needed: - break - if verbose: print(full_tokens) - else: - counted += 1 - word = test_dataset.rev_vocab[id] - if not word.endswith('@@'): - complete_done += 1 - full_tokens.append((prob, word)) - prob_mass += prob - if complete_done >= top_needed: - break - if verbose: print(full_tokens) - - # Probability mass greater than satisfaction_prob so output this prediction - if prob_mass > satisfaction_prob or counted == top_needed: - rank = 0 - correct_found = False - # if verbose: print('correct_token:', correct_token) - if verbose: print('correct_token:', correct_token.replace('@@', '')) - - # if cache_ids and is_id: - # # cache_predictions = self._score_cache_contents(session, config, beam_size, test_dataset, \ - # # list(norm_logits[0]), id_cache, context, remember_state) - # pred_scores = dict() - # for prob, pred in cache_predictions: - # pred_scores[pred] = CACHE_WEIGHT * prob - # for prob, prediction in full_tokens: - # prediction = prediction.replace('@@', '') - # if prediction in pred_scores: - # pred_scores[prediction] = pred_scores[prediction] + (1.0 - CACHE_WEIGHT) * prob - # else: - # prediction = (1.0 - CACHE_WEIGHT) * prob - # # sort - # f_tokens = [] - # for pred in pred_scores: - # f_tokens.append((pred_scores[pred], pred)) - # f_tokens.sort(reverse=True) - # full_tokens = f_tokens[: 10] - if cache_ids and is_id and project_cache_preds is not None: - pred_scores = dict() - for prob, pred in project_cache_preds: - pred_scores[pred] = CACHE_WEIGHT * prob - for prob, prediction in full_tokens: - prediction = prediction.replace('@@', '') - if prediction in pred_scores: - pred_scores[prediction] = pred_scores[prediction] + (1.0 - CACHE_WEIGHT) * 
prob - else: - prediction = (1.0 - CACHE_WEIGHT) * prob - # sort - f_tokens = [] - for pred in pred_scores: - f_tokens.append((pred_scores[pred], pred)) - f_tokens.sort(reverse=True) - full_tokens = f_tokens[: 10] - - for prob, prediction in full_tokens: - if FLAGS.token_model and correct_token == '-UNK-': - break - if (correct_token == '-UNK-' or '-UNK-' in correct_subtokens) and FLAGS.completion_unk_wrong: + mrr = 0.0 + id_mrr = 0.0 + id_acc1 = 0.0 + id_acc3 = 0.0 + id_acc5 = 0.0 + id_acc10 = 0.0 + + ids_in_cache = 0.0 + ids_in_project_cache = 0.0 + context_history_in_ngram_cache = 0.0 + context_history_in_ngram_project_cache = 0.0 + + satisfaction_prob = 0.8 + top_needed = 10 + verbose = FLAGS.verbose + last_test_project = None + train_every = config.num_steps + tokens_done = 0 + files_done = 0 + identifiers = 0 + file_identifiers = 0 + state = session.run(self.reset_state) + + context_size = FLAGS.cache_order - 1 + context_history = deque([None] * context_size, context_size) + ngram_cache = dict() + ngram_project_cache = dict() + # project_context_history = [] + id_cache = trie.CharTrie() + project_id_cache = trie.CharTrie() + CACHE_WEIGHT = FLAGS.file_cache_weight + PROJECT_CACHE_WEIGHT = FLAGS.file_cache_weight * 0.5 + SKIP_CACHE_PROB_THRESHOLD = 0.0 + + raw_data = test_dataset.data # is just one long array + data_len = len(raw_data) + print('Data Length:', data_len) + data_covered = 0 + end_file_id = test_dataset.vocab["-eod-"] + start_index = 0 + file_start_index = 0 + while data_covered < data_len: + # Stop when 1000000 test tokens have been scored. + if tokens_done > 1000000: break - if verbose: print(prob, prediction) - if not correct_found: - rank += 1 - # if prediction == correct_token: - if prediction.replace('@@', '') == correct_token.replace('@@', ''): - mrr += 1.0 / rank - correct_found = True - if verbose: print('MRR:', mrr / tokens_done) - if verbose: print() - - if is_id: - id_mrr += 1.0 / rank - if rank <= 1: - id_acc1 += 1.0 - if rank <= 3: - id_acc3 += 1.0 - if rank <= 5: - id_acc5 += 1.0 - if rank <= 10: - id_acc10 += 1.0 - if not correct_found: - rank += 1 - # if is_id: print(correct_token.replace('@@', ''), full_tokens, '\n') - # if cache_ids and is_id: - # print(rank) - # if rank > 10: - # print(correct_token, full_tokens, cache_predictions) - - if cache_ids and is_id and correct_token != '-UNK-': - id_cache[correct_token] = True - project_id_cache[correct_token] = True - if tuple(context_history) in ngram_cache: - ngram_cache[tuple(context_history)].add(correct_token) - else: - ngram_cache[tuple(context_history)] = set() - ngram_cache[tuple(context_history)].add(correct_token) - if tuple(context_history) in ngram_project_cache: - ngram_project_cache[tuple(context_history)].add(correct_token) + + # Create minibatches for the next file + while raw_data[data_covered] != end_file_id: + data_covered += 1 + data_covered += 1 # eod symbol + file_identifiers = 0 + + # Reset identifier cache for each file. 
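+            # Only the per-file caches are cleared here; the project-level caches persist until the test project changes.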
+ if cache_ids: + id_cache.clear() + ids_in_cache = 0.0 + ids_in_project_cache = 0.0 + context_history = deque([None] * context_size, context_size) + ngram_cache = dict() + context_history_in_ngram_cache = 0.0 + context_history_in_ngram_project_cache = 0.0 + + if dynamic: + test_project = test_projects[files_done] + if test_project != last_test_project: + # New test project so restore the model + ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) + if ckpt and tf.compat.v1.train.checkpoint_exists(ckpt.model_checkpoint_path): + self.saver.restore(session, ckpt.model_checkpoint_path) + + # Reset the project's cache of identifiers if one is used. + if cache_ids: + print('clearing project cache') + ids_in_project_cache = 0.0 + project_id_cache.clear() + ngram_project_cache = dict() + last_test_project = test_project + + file_data = raw_data[file_start_index:data_covered] + file_start_index = data_covered + print('Completion Length:', len(file_data)) + + if not id_map is None: + file_ids = id_map[files_done] + [0] else: - ngram_project_cache[tuple(context_history)] = set() - ngram_project_cache[tuple(context_history)].add(correct_token) - context_history.append(correct_token) - continue - if FLAGS.token_model: print('???') - - # Remember the score of the worst one out of the top_needed (usually 10) full_token candidates - if len(full_tokens) > 0: worst_full_score = full_tokens[-1][0] - else: worst_full_score = 0.0 - # Create a priority queue to rank predictions and continue the search - heapq.heapify(full_tokens) - # Now find beam_size best candidates to initialize the search - candidates_pq = [] - for id, prob in sorted: - word = test_dataset.rev_vocab[id] - if verbose: print(word, prob) - if word.endswith('@@'): - if cache_ids and is_id and False: - # if id_cache.has_subtrie(word[-2]) or prob >= SKIP_CACHE_PROB_THRESHOLD: - if id_cache.has_subtrie(word) or prob >= SKIP_CACHE_PROB_THRESHOLD: - # All the initial state vectors are the same so the first is used - # candidates_pq.append((-prob, Candidate(remember_state[0][0], id, word[:-2], -prob, - # tuple(tokens_before) + (word,)))) - candidates_pq.append((-prob, Candidate(remember_state[0][0], id, word, -prob, - tuple(tokens_before) + (word,)))) + file_ids = [0] * (len(file_data) - 1) + + if not token_map is None: + file_tokens = token_map[files_done] else: - # All the initial state vectors are the same so the first is used - # candidates_pq.append((-prob, Candidate(remember_state[0][0], id, word[:-2], -prob, - # tuple(tokens_before) + (word,)))) - candidates_pq.append((-prob, Candidate(remember_state[0][0], id, word, -prob, - tuple(tokens_before) + (word,)))) - if len(candidates_pq) >= beam_size: - break - heapq.heapify(candidates_pq) - full_tokens_scored = 0 - - # Keep creating candidates until 5000 have been created or total probability mass has exceeded satisfaction_prob - # Search can stop earlier if the best current candidate has score worst than that - # of the worst one of the initial full_tokens since it would be pointless to further continue the search - search_iterations = 0 - while full_tokens_scored < 5000 and prob_mass <= satisfaction_prob and search_iterations < 8: - search_iterations += 1 - # Create a beam of new candidates until 500 full tokens have been produced - to_expand = [] - new_state = (np.empty([beam_size, config.hidden_size]), ) - for c_id in range(beam_size): - if len(candidates_pq) == 0: - break - to_expand.append(heapq.heappop(candidates_pq)) - new_state[0][c_id] = to_expand[-1][1].get_state_vec() - - if 
len(to_expand) < beam_size: break - if -to_expand[0][1].get_parent_prob() < worst_full_score: - break - - feed_dict = {self.inputd: np.array([[candidate.get_id()] for (score, candidate) in to_expand]), - self.keep_probability: 1.0 - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = new_state[i] - else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = new_state[i].c - feed_dict[h] = new_state[i].h - norm_logits, new_state = session.run([self.norm_logits, self.next_state], feed_dict) - for c_id in range(beam_size): - _, candidate = to_expand[c_id] - logits = norm_logits[c_id] - sorted = list(enumerate(logits)) - sorted.sort(key=itemgetter(1), reverse=True) - - for i in range(beam_size): - id, prob = sorted[i] - new_prob = candidate.get_parent_prob() * prob - if cache_ids and is_id and False: - if not test_dataset.rev_vocab[id].endswith('@@'): - if id_cache.has_key(candidate.get_text() + test_dataset.rev_vocab[id]) or new_prob >= SKIP_CACHE_PROB_THRESHOLD: - full_tokens_scored += 1 - prob_mass += -new_prob - heapq.heappushpop(full_tokens, (-new_prob, candidate.get_text() + test_dataset.rev_vocab[id])) - worst = heapq.nsmallest(1, full_tokens) - worst_full_score = worst[0][0] - else: - # word = test_dataset.rev_vocab[id][:-2] - word = test_dataset.rev_vocab[id] - if id_cache.has_subtrie(candidate.get_text() + word) or new_prob >= SKIP_CACHE_PROB_THRESHOLD: - heapq.heappush(candidates_pq, (new_prob, Candidate(new_state[0][c_id], id, candidate.get_text() + word, - new_prob, tuple(candidate.get_subtoken_history()) + - (test_dataset.rev_vocab[id],)))) - else: - if not test_dataset.rev_vocab[id].endswith('@@'): - full_tokens_scored += 1 - prob_mass += -new_prob - heapq.heappushpop(full_tokens, (-new_prob, candidate.get_text() + test_dataset.rev_vocab[id])) - worst = heapq.nsmallest(1, full_tokens) - worst_full_score = worst[0][0] + file_tokens = [0] * (len(file_data) - 1) + + tokens_before = deque([None, test_dataset.rev_vocab[file_data[0]]], 2) + + state = session.run(self.reset_state) + remember_state = state + train_state = session.run(self.reset_state) + in_token = False + + correct_token = '' + train_start = 0 + train_end = 0 + # to_add = [] + for subtoken_id, context_target_is_id in enumerate(zip(file_data[:-1], file_data[1:], file_ids)): + context, target, is_id = context_target_is_id + # print(test_dataset.rev_vocab[context], test_dataset.rev_vocab[target], is_id) + train_end += 1 + + # to_add.append(test_dataset.rev_vocab[context]) + if (subtoken_id - 1) - train_start > train_every and not test_dataset.rev_vocab[context].endswith('@@'): + # train + if dynamic: + tr_context, tr_target, tr_target_weights = file_data[train_start: subtoken_id - 1], \ + file_data[train_start + 1: subtoken_id], \ + [1.0] * ((subtoken_id - 1) - train_start) + # start_train_at = subtoken_id + feed_dict = {self.inputd: np.tile(tr_context, (self.batch_size, 1)), + self.targets: np.tile(tr_target, (self.batch_size, 1)), + self.target_weights: np.tile(tr_target_weights, (self.batch_size, 1)), + self.learning_rate: config.learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = train_state[i] + else: # LSTM + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = train_state[i].c + feed_dict[h] = train_state[i].h + _, cost, train_state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + train_start = train_end + + 
feed_dict = {self.inputd: np.array([[context]] * self.batch_size), + self.targets: np.array([[target]] * self.batch_size), + self.target_weights: np.array([[1.0]] * self.batch_size), + self.keep_probability: 1.0 + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: # LSTM + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + norm_logits, loss, cost, state = session.run([self.norm_logits, self.loss, self.cost, self.next_state], + feed_dict) + + correct_word = test_dataset.rev_vocab[target] + if verbose: print('Correct:', correct_word) + # if train_start > 0 and is_id: + # self._score_cache_contents(session, config, beam_size, test_dataset, list(norm_logits[0]), id_cache, context, state) + + if correct_word.endswith('@@') or (token_map is not None and not file_tokens[subtoken_id]): + if not in_token: + correct_subtokens = [] + remember_state = state + logits = norm_logits[0] + # correct_token = correct_word[:-2] + correct_token = correct_word + else: + # correct_token += correct_word[:-2] + correct_token += correct_word + correct_subtokens.append(correct_word) + in_token = True + continue else: - # word = test_dataset.rev_vocab[id][:-2] - word = test_dataset.rev_vocab[id] - heapq.heappush(candidates_pq, (new_prob, Candidate(new_state[0][c_id], id, candidate.get_text() + word, - new_prob, tuple(candidate.get_subtoken_history()) + - (test_dataset.rev_vocab[id],)))) - - # Get top and count rank of correct answer - # if verbose: print('Correct_token:', correct_token) - if verbose: print('Correct_token:', correct_token.replace('@@', ''), correct_token) - if cache_ids: - if is_id: - id_cache[correct_token] = True - project_id_cache[correct_token] = True - if tuple(context_history) in ngram_cache: - ngram_cache[tuple(context_history)].add(correct_token) - else: - ngram_cache[tuple(context_history)] = set() - ngram_cache[tuple(context_history)].add(correct_token) - if tuple(context_history) in ngram_project_cache: - ngram_project_cache[tuple(context_history)].add(correct_token) - else: - ngram_project_cache[tuple(context_history)] = set() - ngram_project_cache[tuple(context_history)].add(correct_token) - context_history.append(correct_token) - - full_tokens.sort(reverse=True) - - # if cache_ids and is_id: - # # print('full_tokens:', full_tokens) - # # cache_predictions = self._score_cache_contents(session, config, beam_size, test_dataset, \ - # # list(norm_logits[0]), id_cache, context, remember_state) - # pred_scores = dict() - # for prob, pred in cache_predictions: - # pred_scores[pred] = CACHE_WEIGHT * prob - # for prob, prediction in full_tokens: - # prediction = prediction.replace('@@', '') - # if prediction in pred_scores: - # pred_scores[prediction] = pred_scores[prediction] + (1.0 - CACHE_WEIGHT) * prob - # else: - # prediction = (1.0 - CACHE_WEIGHT) * prob - # # sort - # f_tokens = [] - # for pred in pred_scores: - # f_tokens.append((pred_scores[pred], pred)) - # f_tokens.sort(reverse=True) - # full_tokens = f_tokens[: 10] - # # print('new full tokens:', full_tokens) - # # print(correct_token) - # # print() - - if cache_ids and is_id and project_cache_preds is not None: - pred_scores = dict() - for prob, pred in project_cache_preds: - pred_scores[pred] = CACHE_WEIGHT * prob - for prob, prediction in full_tokens: - prediction = prediction.replace('@@', '') - if prediction in pred_scores: - pred_scores[prediction] = pred_scores[prediction] + (1.0 - CACHE_WEIGHT) * prob - else: - prediction = (1.0 - 
CACHE_WEIGHT) * prob - # sort - f_tokens = [] - for pred in pred_scores: - f_tokens.append((pred_scores[pred], pred)) - f_tokens.sort(reverse=True) - full_tokens = f_tokens[: 10] - - correct_found = False - for i, answer in enumerate(full_tokens): - if (correct_token == '-UNK-' or '-UNK-' in correct_subtokens) and FLAGS.completion_unk_wrong: - break - prob, prediction = answer - if verbose: print(-prob, prediction) - if prediction.replace('@@', '') == correct_token.replace('@@', ''): - correct_found = True - mrr += 1.0 / (i + 1) - if verbose: print('MRR:', mrr / tokens_done) - if verbose: print() - - if is_id: - id_mrr += 1.0 / (i + 1) - if (i + 1) <= 1: - id_acc1 += 1.0 - if (i + 1) <= 3: - id_acc3 += 1.0 - if (i + 1) <= 5: - id_acc5 += 1.0 - if (i + 1) <= 10: - id_acc10 += 1.0 - break - if not correct_found: i += 1 - # if cache_ids and is_id: - # print(i + 1) - # if i + 1 > 10: - # print(correct_token, full_tokens, cache_predictions) - # if is_id: print(correct_token.replace('@@', ''), full_tokens, '\n') - files_done += 1 - if cache_ids: - print(id_cache) - - # Train on remainder - if dynamic and len(file_data) - train_start > 1: - tr_context, tr_target, tr_target_weights = file_data[train_start : -1], file_data[train_start + 1:], \ - [1.0] * len(file_data[train_start : -1]) - feed_dict = {self.inputd: np.tile(tr_context, (self.batch_size, 1)), - self.targets: np.tile(tr_target, (self.batch_size, 1)), - self.target_weights: np.tile(tr_target_weights, (self.batch_size, 1)), - self.learning_rate: config.learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = train_state[i] - else: # LSTM - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = train_state[i].c - feed_dict[h] = train_state[i].h - _, cost, train_state, loss, iteration = session.run( - [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - - print(files_done, 'MRR:', mrr / tokens_done) - if not id_map is None : - print(id_mrr / identifiers, id_acc1 / identifiers, id_acc3 / identifiers, \ - id_acc5 / identifiers, id_acc10 / identifiers) - if file_identifiers > 0: - print('File cache recall', ids_in_cache / file_identifiers) - print('Project cache recall', ids_in_project_cache / file_identifiers) - print('File context recall', context_history_in_ngram_cache / file_identifiers ) - print('Project context recall', context_history_in_ngram_project_cache / file_identifiers ) - - print('Tokens scored:', tokens_done) - return mrr / tokens_done - - - def _score_cache_contents(self, session, config, beam_size, test_dataset, logits, id_cache, context, state): - """[summary] + tokens_done += 1 + if not id_map is None and is_id: + identifiers += 1 + file_identifiers += 1 + if not in_token: + correct_subtokens = [] + remember_state = state + logits = norm_logits[0] + correct_token = correct_word + else: + correct_token += correct_word + in_token = False + correct_subtokens.append(correct_word) + + full_tokens_found = 0 + full_tokens = [] + + project_cache_preds = None + if is_id and id_cache.has_key(correct_token): + ids_in_cache += 1.0 + if is_id and project_id_cache.has_key(correct_token): + ids_in_project_cache += 1.0 + if is_id and tuple(context_history) in ngram_cache: + context_history_in_ngram_cache += 1 + # print('file context hit:', correct_token in ngram_cache[tuple(context_history)], len(ngram_cache[tuple(context_history)])) + file_cache_preds = self._score_cache_contents(session, config, beam_size, 
test_dataset, \ + list(norm_logits[0]), + ngram_cache[tuple(context_history)], context, + remember_state) + # print(file_cache_preds, correct_token.replace('@@', '')) + if is_id and tuple(context_history) in ngram_project_cache: + context_history_in_ngram_project_cache += 1 + # print('project context hit:', correct_token in ngram_project_cache[tuple(context_history)], len(ngram_project_cache[tuple(context_history)])) + project_cache_preds = self._score_cache_contents(session, config, beam_size, test_dataset, \ + list(norm_logits[0]), + ngram_project_cache[tuple(context_history)], + context, remember_state) + # print(project_cache_preds, correct_token.replace('@@', '')) + # print() + + # Rank single subtoken long predictions and keep top_needed (usually 10) best complete token ones + sorted = list(enumerate(logits)) + sorted.sort(key=itemgetter(1), reverse=True) + complete_done = 0 + prob_mass = 0.0 + counted = 0 + for id, prob in sorted: + if cache_ids and is_id and False: + word = test_dataset.rev_vocab[id] + counted += 1 + if not word.endswith('@@'): + if id_cache.has_key(word) or prob >= SKIP_CACHE_PROB_THRESHOLD: + complete_done += 1 + full_tokens.append((prob, word)) + prob_mass += prob + if complete_done >= top_needed: + break + if verbose: print(full_tokens) + else: + counted += 1 + word = test_dataset.rev_vocab[id] + if not word.endswith('@@'): + complete_done += 1 + full_tokens.append((prob, word)) + prob_mass += prob + if complete_done >= top_needed: + break + if verbose: print(full_tokens) + + # Probability mass greater than satisfaction_prob so output this prediction + if prob_mass > satisfaction_prob or counted == top_needed: + rank = 0 + correct_found = False + # if verbose: print('correct_token:', correct_token) + if verbose: print('correct_token:', correct_token.replace('@@', '')) + + # if cache_ids and is_id: + # # cache_predictions = self._score_cache_contents(session, config, beam_size, test_dataset, \ + # # list(norm_logits[0]), id_cache, context, remember_state) + # pred_scores = dict() + # for prob, pred in cache_predictions: + # pred_scores[pred] = CACHE_WEIGHT * prob + # for prob, prediction in full_tokens: + # prediction = prediction.replace('@@', '') + # if prediction in pred_scores: + # pred_scores[prediction] = pred_scores[prediction] + (1.0 - CACHE_WEIGHT) * prob + # else: + # prediction = (1.0 - CACHE_WEIGHT) * prob + # # sort + # f_tokens = [] + # for pred in pred_scores: + # f_tokens.append((pred_scores[pred], pred)) + # f_tokens.sort(reverse=True) + # full_tokens = f_tokens[: 10] + if cache_ids and is_id and project_cache_preds is not None: + pred_scores = dict() + for prob, pred in project_cache_preds: + pred_scores[pred] = CACHE_WEIGHT * prob + for prob, prediction in full_tokens: + prediction = prediction.replace('@@', '') + if prediction in pred_scores: + pred_scores[prediction] = pred_scores[prediction] + (1.0 - CACHE_WEIGHT) * prob + else: + prediction = (1.0 - CACHE_WEIGHT) * prob + # sort + f_tokens = [] + for pred in pred_scores: + f_tokens.append((pred_scores[pred], pred)) + f_tokens.sort(reverse=True) + full_tokens = f_tokens[: 10] + + for prob, prediction in full_tokens: + if FLAGS.token_model and correct_token == '-UNK-': + break + if (correct_token == '-UNK-' or '-UNK-' in correct_subtokens) and FLAGS.completion_unk_wrong: + break + if verbose: print(prob, prediction) + if not correct_found: + rank += 1 + # if prediction == correct_token: + if prediction.replace('@@', '') == correct_token.replace('@@', ''): + mrr += 1.0 / rank + correct_found 
= True + if verbose: print('MRR:', mrr / tokens_done) + if verbose: print() + + if is_id: + id_mrr += 1.0 / rank + if rank <= 1: + id_acc1 += 1.0 + if rank <= 3: + id_acc3 += 1.0 + if rank <= 5: + id_acc5 += 1.0 + if rank <= 10: + id_acc10 += 1.0 + if not correct_found: + rank += 1 + # if is_id: print(correct_token.replace('@@', ''), full_tokens, '\n') + # if cache_ids and is_id: + # print(rank) + # if rank > 10: + # print(correct_token, full_tokens, cache_predictions) + + if cache_ids and is_id and correct_token != '-UNK-': + id_cache[correct_token] = True + project_id_cache[correct_token] = True + if tuple(context_history) in ngram_cache: + ngram_cache[tuple(context_history)].add(correct_token) + else: + ngram_cache[tuple(context_history)] = set() + ngram_cache[tuple(context_history)].add(correct_token) + if tuple(context_history) in ngram_project_cache: + ngram_project_cache[tuple(context_history)].add(correct_token) + else: + ngram_project_cache[tuple(context_history)] = set() + ngram_project_cache[tuple(context_history)].add(correct_token) + context_history.append(correct_token) + continue + if FLAGS.token_model: print('???') + + # Remember the score of the worst one out of the top_needed (usually 10) full_token candidates + if len(full_tokens) > 0: + worst_full_score = full_tokens[-1][0] + else: + worst_full_score = 0.0 + # Create a priority queue to rank predictions and continue the search + heapq.heapify(full_tokens) + # Now find beam_size best candidates to initialize the search + candidates_pq = [] + for id, prob in sorted: + word = test_dataset.rev_vocab[id] + if verbose: print(word, prob) + if word.endswith('@@'): + if cache_ids and is_id and False: + # if id_cache.has_subtrie(word[-2]) or prob >= SKIP_CACHE_PROB_THRESHOLD: + if id_cache.has_subtrie(word) or prob >= SKIP_CACHE_PROB_THRESHOLD: + # All the initial state vectors are the same so the first is used + # candidates_pq.append((-prob, Candidate(remember_state[0][0], id, word[:-2], -prob, + # tuple(tokens_before) + (word,)))) + candidates_pq.append((-prob, Candidate(remember_state[0][0], id, word, -prob, + tuple(tokens_before) + (word,)))) + else: + # All the initial state vectors are the same so the first is used + # candidates_pq.append((-prob, Candidate(remember_state[0][0], id, word[:-2], -prob, + # tuple(tokens_before) + (word,)))) + candidates_pq.append((-prob, Candidate(remember_state[0][0], id, word, -prob, + tuple(tokens_before) + (word,)))) + if len(candidates_pq) >= beam_size: + break + heapq.heapify(candidates_pq) + full_tokens_scored = 0 + + # Keep creating candidates until 5000 have been created or total probability mass has exceeded satisfaction_prob + # Search can stop earlier if the best current candidate has score worst than that + # of the worst one of the initial full_tokens since it would be pointless to further continue the search + search_iterations = 0 + while full_tokens_scored < 5000 and prob_mass <= satisfaction_prob and search_iterations < 8: + search_iterations += 1 + # Create a beam of new candidates until 500 full tokens have been produced + to_expand = [] + new_state = (np.empty([beam_size, config.hidden_size]),) + for c_id in range(beam_size): + if len(candidates_pq) == 0: + break + to_expand.append(heapq.heappop(candidates_pq)) + new_state[0][c_id] = to_expand[-1][1].get_state_vec() + + if len(to_expand) < beam_size: break + if -to_expand[0][1].get_parent_prob() < worst_full_score: + break + + feed_dict = {self.inputd: np.array([[candidate.get_id()] for (score, candidate) in to_expand]), 
+ self.keep_probability: 1.0 + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = new_state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = new_state[i].c + feed_dict[h] = new_state[i].h + + norm_logits, new_state = session.run([self.norm_logits, self.next_state], feed_dict) + for c_id in range(beam_size): + _, candidate = to_expand[c_id] + logits = norm_logits[c_id] + sorted = list(enumerate(logits)) + sorted.sort(key=itemgetter(1), reverse=True) + + for i in range(beam_size): + id, prob = sorted[i] + new_prob = candidate.get_parent_prob() * prob + if cache_ids and is_id and False: + if not test_dataset.rev_vocab[id].endswith('@@'): + if id_cache.has_key(candidate.get_text() + test_dataset.rev_vocab[ + id]) or new_prob >= SKIP_CACHE_PROB_THRESHOLD: + full_tokens_scored += 1 + prob_mass += -new_prob + heapq.heappushpop(full_tokens, ( + -new_prob, candidate.get_text() + test_dataset.rev_vocab[id])) + worst = heapq.nsmallest(1, full_tokens) + worst_full_score = worst[0][0] + else: + # word = test_dataset.rev_vocab[id][:-2] + word = test_dataset.rev_vocab[id] + if id_cache.has_subtrie( + candidate.get_text() + word) or new_prob >= SKIP_CACHE_PROB_THRESHOLD: + heapq.heappush(candidates_pq, ( + new_prob, Candidate(new_state[0][c_id], id, candidate.get_text() + word, + new_prob, tuple(candidate.get_subtoken_history()) + + (test_dataset.rev_vocab[id],)))) + else: + if not test_dataset.rev_vocab[id].endswith('@@'): + full_tokens_scored += 1 + prob_mass += -new_prob + heapq.heappushpop(full_tokens, + (-new_prob, candidate.get_text() + test_dataset.rev_vocab[id])) + worst = heapq.nsmallest(1, full_tokens) + worst_full_score = worst[0][0] + else: + # word = test_dataset.rev_vocab[id][:-2] + word = test_dataset.rev_vocab[id] + heapq.heappush(candidates_pq, ( + new_prob, Candidate(new_state[0][c_id], id, candidate.get_text() + word, + new_prob, tuple(candidate.get_subtoken_history()) + + (test_dataset.rev_vocab[id],)))) + + # Get top and count rank of correct answer + # if verbose: print('Correct_token:', correct_token) + if verbose: print('Correct_token:', correct_token.replace('@@', ''), correct_token) + if cache_ids: + if is_id: + id_cache[correct_token] = True + project_id_cache[correct_token] = True + if tuple(context_history) in ngram_cache: + ngram_cache[tuple(context_history)].add(correct_token) + else: + ngram_cache[tuple(context_history)] = set() + ngram_cache[tuple(context_history)].add(correct_token) + if tuple(context_history) in ngram_project_cache: + ngram_project_cache[tuple(context_history)].add(correct_token) + else: + ngram_project_cache[tuple(context_history)] = set() + ngram_project_cache[tuple(context_history)].add(correct_token) + context_history.append(correct_token) + + full_tokens.sort(reverse=True) + + # if cache_ids and is_id: + # # print('full_tokens:', full_tokens) + # # cache_predictions = self._score_cache_contents(session, config, beam_size, test_dataset, \ + # # list(norm_logits[0]), id_cache, context, remember_state) + # pred_scores = dict() + # for prob, pred in cache_predictions: + # pred_scores[pred] = CACHE_WEIGHT * prob + # for prob, prediction in full_tokens: + # prediction = prediction.replace('@@', '') + # if prediction in pred_scores: + # pred_scores[prediction] = pred_scores[prediction] + (1.0 - CACHE_WEIGHT) * prob + # else: + # prediction = (1.0 - CACHE_WEIGHT) * prob + # # sort + # f_tokens = [] + # for pred in pred_scores: + # f_tokens.append((pred_scores[pred], pred)) + # f_tokens.sort(reverse=True) 
+ # full_tokens = f_tokens[: 10] + # # print('new full tokens:', full_tokens) + # # print(correct_token) + # # print() + + if cache_ids and is_id and project_cache_preds is not None: + pred_scores = dict() + for prob, pred in project_cache_preds: + pred_scores[pred] = CACHE_WEIGHT * prob + for prob, prediction in full_tokens: + prediction = prediction.replace('@@', '') + if prediction in pred_scores: + pred_scores[prediction] = pred_scores[prediction] + (1.0 - CACHE_WEIGHT) * prob + else: + prediction = (1.0 - CACHE_WEIGHT) * prob + # sort + f_tokens = [] + for pred in pred_scores: + f_tokens.append((pred_scores[pred], pred)) + f_tokens.sort(reverse=True) + full_tokens = f_tokens[: 10] + + correct_found = False + for i, answer in enumerate(full_tokens): + if (correct_token == '-UNK-' or '-UNK-' in correct_subtokens) and FLAGS.completion_unk_wrong: + break + prob, prediction = answer + if verbose: print(-prob, prediction) + if prediction.replace('@@', '') == correct_token.replace('@@', ''): + correct_found = True + mrr += 1.0 / (i + 1) + if verbose: print('MRR:', mrr / tokens_done) + if verbose: print() + + if is_id: + id_mrr += 1.0 / (i + 1) + if (i + 1) <= 1: + id_acc1 += 1.0 + if (i + 1) <= 3: + id_acc3 += 1.0 + if (i + 1) <= 5: + id_acc5 += 1.0 + if (i + 1) <= 10: + id_acc10 += 1.0 + break + if not correct_found: i += 1 + # if cache_ids and is_id: + # print(i + 1) + # if i + 1 > 10: + # print(correct_token, full_tokens, cache_predictions) + # if is_id: print(correct_token.replace('@@', ''), full_tokens, '\n') + files_done += 1 + if cache_ids: + print(id_cache) + + # Train on remainder + if dynamic and len(file_data) - train_start > 1: + tr_context, tr_target, tr_target_weights = file_data[train_start: -1], file_data[train_start + 1:], \ + [1.0] * len(file_data[train_start: -1]) + feed_dict = {self.inputd: np.tile(tr_context, (self.batch_size, 1)), + self.targets: np.tile(tr_target, (self.batch_size, 1)), + self.target_weights: np.tile(tr_target_weights, (self.batch_size, 1)), + self.learning_rate: config.learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = train_state[i] + else: # LSTM + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = train_state[i].c + feed_dict[h] = train_state[i].h + _, cost, train_state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + + print(files_done, 'MRR:', mrr / tokens_done) + if not id_map is None: + print(id_mrr / identifiers, id_acc1 / identifiers, id_acc3 / identifiers, \ + id_acc5 / identifiers, id_acc10 / identifiers) + if file_identifiers > 0: + print('File cache recall', ids_in_cache / file_identifiers) + print('Project cache recall', ids_in_project_cache / file_identifiers) + print('File context recall', context_history_in_ngram_cache / file_identifiers) + print('Project context recall', context_history_in_ngram_project_cache / file_identifiers) + + print('Tokens scored:', tokens_done) + return mrr / tokens_done + + def _score_cache_contents(self, session, config, beam_size, test_dataset, logits, id_cache, context, state): + """[summary] Arguments: session {[type]} -- [description] @@ -1501,99 +1590,103 @@ def _score_cache_contents(self, session, config, beam_size, test_dataset, logits Returns: [type] -- [description] """ - ranked_pred = [] - heapq.heapify(ranked_pred) - candidates_pq = [] - heapq.heapify(candidates_pq) - for identifier in id_cache: - # print(identifier) - if not 
'@@' in identifier: - index = test_dataset.vocab[identifier] - prob = logits[index] - # print(prob) - if len(ranked_pred) < 10: heapq.heappush(ranked_pred, (prob, identifier)) - else: heapq.heappushpop(ranked_pred, (prob, identifier)) - else: - identifier_parts = identifier.split('@@') - index = test_dataset.vocab[identifier_parts[0] + '@@'] - prob = logits[index] - candidates_pq.append((-prob, Candidate(state[0][0], index, identifier_parts, -prob, [0]))) - # unscored.append((identifier_parts, logits[index])) - # print(ranked_pred) - # print(candidates_pq) - - while len(candidates_pq) > 0: - to_expand = [] - new_state = (np.empty([beam_size, config.hidden_size]), ) - for c_id in range(beam_size): - if len(candidates_pq) == 0: - break - to_expand.append(heapq.heappop(candidates_pq)) - new_state[0][c_id] = to_expand[-1][1].get_state_vec() - - missing = beam_size - len(to_expand) - for m in range(missing): - to_expand.append((0.0, Candidate(state[0][0], 0, [], 0.0, [0]))) - - # if len(to_expand) < beam_size: break - - feed_dict = {self.inputd: np.array([[candidate.get_id()] for (score, candidate) in to_expand]), - self.keep_probability: 1.0 - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = new_state[i] - else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = new_state[i].c - feed_dict[h] = new_state[i].h - norm_logits, new_state = session.run([self.norm_logits, self.next_state], feed_dict) - for c_id in range(beam_size): - _, candidate = to_expand[c_id] - if candidate.get_parent_prob() == 0.0: - continue - - logits = list(norm_logits[c_id]) - identifier_parts = candidate.get_text() - # print(identifier_parts) - next_part_index = candidate.get_subtoken_history()[-1] + 1 - if next_part_index == len(identifier_parts) - 1: - index = test_dataset.vocab[identifier_parts[next_part_index]] - prob = logits[index] - new_prob = candidate.get_parent_prob() * prob - candidate_text = ''.join(candidate.get_text()) - if len(ranked_pred) < 10: - # print('pushing full token:', (-new_prob, candidate_text)) - heapq.heappush(ranked_pred, (-new_prob, candidate_text)) - else: - # print('pushing full token:', (-new_prob, candidate_text)) - heapq.heappushpop(ranked_pred, (-new_prob, candidate_text)) - else: - index = test_dataset.vocab[identifier_parts[next_part_index] + '@@'] - prob = logits[index] - new_prob = candidate.get_parent_prob() * prob - # print('Pushing new candidate:', (new_prob, Candidate(new_state[0][c_id], index, candidate.get_text(), - # new_prob, list(candidate.get_subtoken_history()) + [next_part_index]))) - heapq.heappush(candidates_pq, (new_prob, Candidate(new_state[0][c_id], index, candidate.get_text(), - new_prob, list(candidate.get_subtoken_history()) + [next_part_index]))) - - ranked_pred.sort(reverse=True) - scores = np.asarray([prob for prob, token in ranked_pred]) - scores_sum = sum(scores) - scores = [score / scores_sum for score in scores] - # print(ranked_pred) - norm_pred = [] - for i in range(len(ranked_pred)): - # print(scores[i]) - norm_pred.append( (scores[i], ranked_pred[i][1]) ) - # print(candidates_pq) - # print('cache norm pred:', norm_pred) - # sys.exit(0) - return norm_pred - - - def maintenance_completion(self, session, config, test_lines, test_projects, train_vocab, train_vocab_rev, beam_size): - """ + ranked_pred = [] + heapq.heapify(ranked_pred) + candidates_pq = [] + heapq.heapify(candidates_pq) + for identifier in id_cache: + # print(identifier) + if not '@@' in identifier: + index = test_dataset.vocab[identifier] + prob = 
logits[index] + # print(prob) + if len(ranked_pred) < 10: + heapq.heappush(ranked_pred, (prob, identifier)) + else: + heapq.heappushpop(ranked_pred, (prob, identifier)) + else: + identifier_parts = identifier.split('@@') + index = test_dataset.vocab[identifier_parts[0] + '@@'] + prob = logits[index] + candidates_pq.append((-prob, Candidate(state[0][0], index, identifier_parts, -prob, [0]))) + # unscored.append((identifier_parts, logits[index])) + # print(ranked_pred) + # print(candidates_pq) + + while len(candidates_pq) > 0: + to_expand = [] + new_state = (np.empty([beam_size, config.hidden_size]),) + for c_id in range(beam_size): + if len(candidates_pq) == 0: + break + to_expand.append(heapq.heappop(candidates_pq)) + new_state[0][c_id] = to_expand[-1][1].get_state_vec() + + missing = beam_size - len(to_expand) + for m in range(missing): + to_expand.append((0.0, Candidate(state[0][0], 0, [], 0.0, [0]))) + + # if len(to_expand) < beam_size: break + + feed_dict = {self.inputd: np.array([[candidate.get_id()] for (score, candidate) in to_expand]), + self.keep_probability: 1.0 + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = new_state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = new_state[i].c + feed_dict[h] = new_state[i].h + norm_logits, new_state = session.run([self.norm_logits, self.next_state], feed_dict) + for c_id in range(beam_size): + _, candidate = to_expand[c_id] + if candidate.get_parent_prob() == 0.0: + continue + + logits = list(norm_logits[c_id]) + identifier_parts = candidate.get_text() + # print(identifier_parts) + next_part_index = candidate.get_subtoken_history()[-1] + 1 + if next_part_index == len(identifier_parts) - 1: + index = test_dataset.vocab[identifier_parts[next_part_index]] + prob = logits[index] + new_prob = candidate.get_parent_prob() * prob + candidate_text = ''.join(candidate.get_text()) + if len(ranked_pred) < 10: + # print('pushing full token:', (-new_prob, candidate_text)) + heapq.heappush(ranked_pred, (-new_prob, candidate_text)) + else: + # print('pushing full token:', (-new_prob, candidate_text)) + heapq.heappushpop(ranked_pred, (-new_prob, candidate_text)) + else: + index = test_dataset.vocab[identifier_parts[next_part_index] + '@@'] + prob = logits[index] + new_prob = candidate.get_parent_prob() * prob + # print('Pushing new candidate:', (new_prob, Candidate(new_state[0][c_id], index, candidate.get_text(), + # new_prob, list(candidate.get_subtoken_history()) + [next_part_index]))) + heapq.heappush(candidates_pq, (new_prob, Candidate(new_state[0][c_id], index, candidate.get_text(), + new_prob, + list(candidate.get_subtoken_history()) + [ + next_part_index]))) + + ranked_pred.sort(reverse=True) + scores = np.asarray([prob for prob, token in ranked_pred]) + scores_sum = sum(scores) + scores = [score / scores_sum for score in scores] + # print(ranked_pred) + norm_pred = [] + for i in range(len(ranked_pred)): + # print(scores[i]) + norm_pred.append((scores[i], ranked_pred[i][1])) + # print(candidates_pq) + # print('cache norm pred:', norm_pred) + # sys.exit(0) + return norm_pred + + def maintenance_completion(self, session, config, test_lines, test_projects, train_vocab, train_vocab_rev, + beam_size): + """ Runs the code completion for the maintenance scenario. :param session: The TF session in which operations should be run. :param config: The configuration to be used for the model. 
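The `_score_cache_contents` helper above walks each cached identifier through the network one BPE piece at a time, multiplying the per-step probabilities, keeping the ten best complete identifiers in a heap, and finally normalizing their scores by the sum. A minimal sketch of that chained-probability idea, detached from the TensorFlow session, is shown below; the `subtoken_probs(history, piece)` callable is an assumption introduced only for illustration and stands in for evaluating `self.norm_logits` on the current beam state.

    # Hedged sketch: score one cached identifier by chaining conditional
    # subtoken probabilities, mirroring the idea behind _score_cache_contents.
    # `subtoken_probs(history, piece)` is a hypothetical callable returning
    # P(piece | history); in the real code this comes from self.norm_logits.
    def score_cached_identifier(identifier, subtoken_probs):
        parts = identifier.split('@@')
        # Every piece except the last keeps the '@@' continuation marker,
        # matching the BPE vocabulary entries looked up in test_dataset.vocab.
        pieces = [p + '@@' for p in parts[:-1]] + [parts[-1]]
        prob, history = 1.0, []
        for piece in pieces:
            prob *= subtoken_probs(history, piece)
            history.append(piece)
        return prob

Under this view, the returned `norm_pred` list is simply the ten highest chained scores divided by their sum, which is what the caller then interpolates with the language-model distribution via `CACHE_WEIGHT`.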
@@ -1604,421 +1697,430 @@ def maintenance_completion(self, session, config, test_lines, test_projects, tra :param beam_size: The size of the beam to be used by the search algorithm. :return: MRR for the test set. """ - # If checkpoint does not exist throw an exception - ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if not tf.train.checkpoint_exists(ckpt.model_checkpoint_path): - raise Exception('Checkpoint does not exist!') - - new_learning_rate = config.learning_rate - ctr = 0 - - mrr = 0.0 - satisfaction_prob = 0.8 - top_needed = 10 - verbose = False - train_every = 20 - tokens_done = 0 - - # First compute which files belong to each project. - project_sizes = [] - last_project_name = '' - project_file_size = 0 - for test_project in test_projects: - if test_project != last_project_name: - if last_project_name != '': - project_sizes.append(project_file_size) - project_file_size = 0 - else: - project_file_size += 1 - last_project_name = test_project - print(project_sizes) - print(sum(project_sizes)) - print() - - # Now distribute test lines based on project size - project_test_lines = [] - files_distributed = 0 - for project_file_size in project_sizes: - project_test_lines.append(test_lines[files_distributed : files_distributed + project_file_size]) - files_distributed += project_file_size - - partitions = 20 - for proj_id, test_lines in enumerate(project_test_lines): - print(len(test_lines)) - large_project = len(test_lines) > 200 - - if large_project: - # The project is very large so partition in partition_size parts. - # For each of the 20 parts train a model on all the parts excluding itself. - partition_size = len(test_lines) / partitions - for partition in range(partitions): - partition_words = [] - for i, line in enumerate(test_lines): - if i / partition_size != partition and not (i / partition_size == partitions + 1 and partition == partitions): - partition_words.extend([word for word in line.split(' ')]) - # If checkpoint does not exist throw an exception - ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if not tf.train.checkpoint_exists(ckpt.model_checkpoint_path): - raise Exception('Checkpoint does not exist!') - self.saver.restore(session, ckpt.model_checkpoint_path) - - partition_path = os.path.join(FLAGS.train_dir, "partition%d" % partition) - if os.path.isdir(partition_path): shutil.rmtree(partition_path) - - partition_ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in partition_words] - partition_dataset = reader.dataset(partition_ids, train_vocab, train_vocab_rev) - - # Reset the LSTM state to zeros and train - state = session.run(self.reset_state) - try: - epoch_log_perp_unnorm = epoch_total_weights = 0.0 - epoch_sub_total_weights = 0.0 - for step, (context, target, target_weights, sub_target_weights) in enumerate( - partition_dataset.batch_producer(1, self.num_steps)): - feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), - self.targets: np.tile(target, (self.batch_size, 1)), - self.target_weights: np.tile(target_weights, (self.batch_size, 1)), - self.learning_rate: new_learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - _, cost, state, loss, iteration = session.run( - [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - epoch_log_perp_unnorm += np.sum(loss) - 
epoch_total_weights += np.sum(sum(target_weights)) - epoch_sub_total_weights += np.sum(sum(sub_target_weights)) - # break - train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights - train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") - print(train_perplexity) - checkpoint_path = os.path.join(partition_path, "model") - self.saver.save(session, checkpoint_path, global_step=self.global_step) - except (StopTrainingException, KeyboardInterrupt): - print("Finished training ........") - - for lines_done, test_line in enumerate(test_lines): - ctr += 1 - if tokens_done > 1000000: - return mrr / tokens_done - - # Restore the global model + # If checkpoint does not exist throw an exception ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): - self.saver.restore(session, ckpt.model_checkpoint_path) - - if large_project: - # Load pretrained model and only train on the rest of the files from this partition - partition = lines_done / partition_size - partition = min(partitions - 1, partition) # last partition can be bigger - partition_path = os.path.join(FLAGS.train_dir, "partition%d" % partition) - part_ckpt = tf.train.get_checkpoint_state(partition_path) - self.saver.restore(session, part_ckpt.model_checkpoint_path) - - train_words = [] - if partition < partitions - 1: - partition_lines = zip(range(partition * partition_size, (partition + 1) * partition_size), - test_lines[partition * partition_size : (partition + 1) * partition_size]) - else: - partition_lines = zip(range(partition * partition_size, len(test_lines)), - test_lines[partition * partition_size : (partition + 1) * partition_size]) - for i, line in partition_lines: - if i != (lines_done % partition_size): - train_words.extend([word for word in line.split(' ')]) - else: - # Now for each file in the current test project use the rest as train data - train_words = [] - for i, line in enumerate(test_lines): - if i != lines_done: - train_words.extend([word for word in line.split(' ')]) - - # Convert the train data words to ids - ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in train_words] - train_dataset = reader.dataset(ids, train_vocab, train_vocab_rev) - - # Reset the LSTM state to zeros and train - state = session.run(self.reset_state) - try: - epoch_log_perp_unnorm = epoch_total_weights = 0.0 - epoch_sub_total_weights = 0.0 - for step, (context, target, target_weights, sub_target_weights) in enumerate( - train_dataset.batch_producer(1, self.num_steps)): - feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), - self.targets: np.tile(target, (self.batch_size, 1)), - self.target_weights: np.tile(target_weights, (self.batch_size, 1)), - self.learning_rate: new_learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - # print("state number " + str(i)) - _, cost, state, loss, iteration = session.run([self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - epoch_log_perp_unnorm += np.sum(loss) - epoch_total_weights += np.sum(sum(target_weights)) - epoch_sub_total_weights += np.sum(sum(sub_target_weights)) - train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights - train_perplexity = math.exp(train_log_perplexity) if 
train_log_perplexity < 300 else float("inf") - # lengths.append(int(round(epoch_sub_total_weights, 0))) - - # Training done. Now test on test file - ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] - for word in test_lines[lines_done].split(' ')] - - state = session.run(self.reset_state) - remember_state = state - train_state = session.run(self.reset_state) - in_token = False - correct_token = '' - train_start = 0 - train_end = 0 - - # Reset the LSTM state to zeros and train - for subtoken_id, context_target in enumerate(zip(ids[:-1], ids[1:])): - context, target = context_target - train_end += 1 - - if (subtoken_id - 1) - train_start > train_every and not train_vocab_rev[context].endswith('@@'): - # train - tr_context, tr_target, tr_target_weights = ids[train_start: subtoken_id - 1], \ - ids[train_start + 1: subtoken_id], \ - [1.0] * ((subtoken_id - 1) - train_start) - start_train_at = subtoken_id - feed_dict = {self.inputd: np.tile(tr_context, (self.batch_size, 1)), - self.targets: np.tile(tr_target, (self.batch_size, 1)), - self.target_weights: np.tile(tr_target_weights, (self.batch_size, 1)), - self.learning_rate: config.learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = train_state[i] - else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = train_state[i].c - feed_dict[h] = train_state[i].h - _, cost, train_state, loss, iteration = session.run( - [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - train_start = train_end - - feed_dict = {self.inputd: np.array([[context]] * self.batch_size), - self.targets: np.array([[target]] * self.batch_size), - self.target_weights: np.array([[1.0]] * self.batch_size), - self.keep_probability: 1.0 - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = state[i] - else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = state[i].c - feed_dict[h] = state[i].h - norm_logits, loss, cost, state = session.run([self.norm_logits, self.loss, self.cost, self.next_state], - feed_dict) - correct_word = train_vocab_rev[target] - - if correct_word.endswith('@@'): - if not in_token: - correct_subtokens = [] - remember_state = state - logits = norm_logits[0] - correct_token = correct_word[:-2] - else: - correct_token += correct_word[:-2] - correct_subtokens.append(correct_word) - in_token = True - continue - else: - tokens_done += 1 - if not in_token: - correct_subtokens = [] - remember_state = state - logits = norm_logits[0] - correct_token = correct_word - else: - correct_token += correct_word - in_token = False - correct_subtokens.append(correct_word) - - full_tokens_found = 0 - full_tokens = [] - - # Rank single subtoken long predictions and keep top_needed (usually 10) best complete token ones - sorted = list(enumerate(logits)) - sorted.sort(key=itemgetter(1), reverse=True) - complete_done = 0 - prob_mass = 0.0 - counted = 0 - for id, prob in sorted: - counted += 1 - word = train_vocab_rev[id] - if not word.endswith('@@'): - complete_done += 1 - full_tokens.append((prob, word)) - prob_mass += prob - if complete_done >= top_needed: - break - - # Probability mass greater than satisfaction_prob so output this prediction - if prob_mass > satisfaction_prob or counted == top_needed: - rank = 0 - correct_found = False - if verbose: print(correct_token) - for prob, prediction in full_tokens: - if verbose: print(prob, prediction) - if not correct_found: - rank += 1 - if 
prediction == correct_token: - mrr += 1.0 / rank - correct_found = True - if verbose: print('MRR:', mrr / tokens_done) - if verbose: print() - continue - - # Remember the score of the worst one out of the top_needed (usually 10) full_token candidates - worst_full_score = full_tokens[-1][0] - # Create a priority queue to rank predictions and continue the search - heapq.heapify(full_tokens) - # Now find beam_size best candidates to initialize the search - candidates_pq = [] - for id, prob in sorted: - word = train_vocab_rev[id] - if word.endswith('@@'): - # All state vectors are the same so the first is used - candidates_pq.append((-prob, Candidate(remember_state[0][0], id, word[:-2], -prob, ''))) - if len(candidates_pq) >= beam_size: - break - heapq.heapify(candidates_pq) - full_tokens_scored = 0 - - # Keep creating candidates until 5000 have been created or total probability mass has exceeded satisfaction_prob - # Search can stop earlier if the best current candidate has score worst than that - # of the worst one of the initial full_tokens since it would be pointless to further continue the search - search_iterations = 0 - while full_tokens_scored < 5000 and prob_mass <= satisfaction_prob and search_iterations < 8: - search_iterations += 1 - # Create a beam of new candidates until 500 full tokens have been produced - to_expand = [] - new_state = (np.empty([beam_size, config.hidden_size]),) - for c_id in range(beam_size): - if len(candidates_pq) == 0: - break - to_expand.append(heapq.heappop(candidates_pq)) - new_state[0][c_id] = to_expand[-1][1].get_state_vec() - - if len(to_expand) < beam_size: break - if -to_expand[0][1].get_parent_prob() < worst_full_score: - break + if not tf.compat.v1.train.checkpoint_exists(ckpt.model_checkpoint_path): + raise Exception('Checkpoint does not exist!') - feed_dict = {self.inputd: np.array([[candidate.get_id()] for (score, candidate) in to_expand]), - self.keep_probability: 1.0 - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = new_state[i] - else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = new_state[i].c - feed_dict[h] = new_state[i].h - norm_logits, new_state = session.run([self.norm_logits, self.next_state], feed_dict) - for c_id in range(beam_size): - if len(to_expand) <= c_id: break - _, candidate = to_expand[c_id] - logits = norm_logits[c_id] - sorted = list(enumerate(logits)) - sorted.sort(key=itemgetter(1), reverse=True) - for i in range(beam_size): - id, prob = sorted[i] - new_prob = candidate.get_parent_prob() * prob - if not train_vocab_rev[id].endswith('@@'): - full_tokens_scored += 1 - prob_mass += -new_prob - heapq.heappushpop(full_tokens, (-new_prob, candidate.get_text() + train_vocab_rev[id])) - worst = heapq.nsmallest(1, full_tokens) - worst_full_score = worst[0][0] - else: - word = train_vocab_rev[id][:-2] - heapq.heappush(candidates_pq, - (new_prob, Candidate(new_state[0][c_id], id, candidate.get_text() + word, - new_prob, ''))) - - # Get top and count rank of correct answer - if verbose: print(correct_token) - full_tokens.sort(reverse=True) - for i, answer in enumerate(full_tokens): - prob, prediction = answer - if verbose: print(-prob, prediction) - if prediction == correct_token: - mrr += 1.0 / (i + 1) - if verbose: print('MRR:', mrr / tokens_done) - if verbose: print() - - pass - # Train on remainder - if len(ids) - start_train_at > 1: - tr_context, tr_target, tr_target_weights = ids[train_start : -1], \ - ids[train_start + 1 : ], \ - [1.0] * len(ids[train_start : -1]) - feed_dict 
= {self.inputd: np.tile(tr_context, (self.batch_size, 1)), - self.targets: np.tile(tr_target, (self.batch_size, 1)), - self.target_weights: np.tile(tr_target_weights, (self.batch_size, 1)), - self.learning_rate: config.learning_rate, - self.keep_probability: config.keep_prob - } - if FLAGS.gru: - for i, h in enumerate(self.reset_state): - feed_dict[h] = train_state[i] + new_learning_rate = config.learning_rate + ctr = 0 + + mrr = 0.0 + satisfaction_prob = 0.8 + top_needed = 10 + verbose = False + train_every = 20 + tokens_done = 0 + + # First compute which files belong to each project. + project_sizes = [] + last_project_name = '' + project_file_size = 0 + for test_project in test_projects: + if test_project != last_project_name: + if last_project_name != '': + project_sizes.append(project_file_size) + project_file_size = 0 else: - for i, (c, h) in enumerate(self.reset_state): - feed_dict[c] = train_state[i].c - feed_dict[h] = train_state[i].h - _, cost, train_state, loss, iteration = session.run( - [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) - - print(ctr, 'MRR:', mrr / tokens_done) - sys.stdout.flush() - except (StopTrainingException, KeyboardInterrupt): - print("Finished training ........") - - print("Projects Done:", proj_id + 1) - return mrr / tokens_done + project_file_size += 1 + last_project_name = test_project + print(project_sizes) + print(sum(project_sizes)) + print() + + # Now distribute test lines based on project size + project_test_lines = [] + files_distributed = 0 + for project_file_size in project_sizes: + project_test_lines.append(test_lines[files_distributed: files_distributed + project_file_size]) + files_distributed += project_file_size - - def write_model_parameters(self, model_directory): - """ + partitions = 20 + for proj_id, test_lines in enumerate(project_test_lines): + print(len(test_lines)) + large_project = len(test_lines) > 200 + + if large_project: + # The project is very large so partition in partition_size parts. + # For each of the 20 parts train a model on all the parts excluding itself. 
+ partition_size = len(test_lines) / partitions + for partition in range(partitions): + partition_words = [] + for i, line in enumerate(test_lines): + if i / partition_size != partition and not ( + i / partition_size == partitions + 1 and partition == partitions): + partition_words.extend([word for word in line.split(' ')]) + # If checkpoint does not exist throw an exception + ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) + if not tf.compat.v1.train.checkpoint_exists(ckpt.model_checkpoint_path): + raise Exception('Checkpoint does not exist!') + self.saver.restore(session, ckpt.model_checkpoint_path) + + partition_path = os.path.join(FLAGS.train_dir, "partition%d" % partition) + if os.path.isdir(partition_path): shutil.rmtree(partition_path) + + partition_ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in + partition_words] + partition_dataset = reader.dataset(partition_ids, train_vocab, train_vocab_rev) + + # Reset the LSTM state to zeros and train + state = session.run(self.reset_state) + try: + epoch_log_perp_unnorm = epoch_total_weights = 0.0 + epoch_sub_total_weights = 0.0 + for step, (context, target, target_weights, sub_target_weights) in enumerate( + partition_dataset.batch_producer(1, self.num_steps)): + feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), + self.targets: np.tile(target, (self.batch_size, 1)), + self.target_weights: np.tile(target_weights, (self.batch_size, 1)), + self.learning_rate: new_learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + _, cost, state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + epoch_log_perp_unnorm += np.sum(loss) + epoch_total_weights += np.sum(sum(target_weights)) + epoch_sub_total_weights += np.sum(sum(sub_target_weights)) + # break + train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights + train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float( + "inf") + print(train_perplexity) + checkpoint_path = os.path.join(partition_path, "model") + self.saver.save(session, checkpoint_path, global_step=self.global_step) + except (StopTrainingException, KeyboardInterrupt): + print("Finished training ........") + + for lines_done, test_line in enumerate(test_lines): + ctr += 1 + if tokens_done > 1000000: + return mrr / tokens_done + + # Restore the global model + ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) + if ckpt and tf.compat.v1.train.checkpoint_exists(ckpt.model_checkpoint_path): + self.saver.restore(session, ckpt.model_checkpoint_path) + + if large_project: + # Load pretrained model and only train on the rest of the files from this partition + partition = lines_done / partition_size + partition = min(partitions - 1, partition) # last partition can be bigger + partition_path = os.path.join(FLAGS.train_dir, "partition%d" % partition) + part_ckpt = tf.train.get_checkpoint_state(partition_path) + self.saver.restore(session, part_ckpt.model_checkpoint_path) + + train_words = [] + if partition < partitions - 1: + partition_lines = zip(range(partition * partition_size, (partition + 1) * partition_size), + test_lines[partition * partition_size: (partition + 1) * partition_size]) + else: + partition_lines = zip(range(partition * partition_size, len(test_lines)), + 
test_lines[partition * partition_size: (partition + 1) * partition_size]) + for i, line in partition_lines: + if i != (lines_done % partition_size): + train_words.extend([word for word in line.split(' ')]) + else: + # Now for each file in the current test project use the rest as train data + train_words = [] + for i, line in enumerate(test_lines): + if i != lines_done: + train_words.extend([word for word in line.split(' ')]) + + # Convert the train data words to ids + ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] for word in train_words] + train_dataset = reader.dataset(ids, train_vocab, train_vocab_rev) + + # Reset the LSTM state to zeros and train + state = session.run(self.reset_state) + try: + epoch_log_perp_unnorm = epoch_total_weights = 0.0 + epoch_sub_total_weights = 0.0 + for step, (context, target, target_weights, sub_target_weights) in enumerate( + train_dataset.batch_producer(1, self.num_steps)): + feed_dict = {self.inputd: np.tile(context, (self.batch_size, 1)), + self.targets: np.tile(target, (self.batch_size, 1)), + self.target_weights: np.tile(target_weights, (self.batch_size, 1)), + self.learning_rate: new_learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + # print("state number " + str(i)) + _, cost, state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + epoch_log_perp_unnorm += np.sum(loss) + epoch_total_weights += np.sum(sum(target_weights)) + epoch_sub_total_weights += np.sum(sum(sub_target_weights)) + train_log_perplexity = epoch_log_perp_unnorm / epoch_total_weights + train_perplexity = math.exp(train_log_perplexity) if train_log_perplexity < 300 else float("inf") + # lengths.append(int(round(epoch_sub_total_weights, 0))) + + # Training done. 
Now test on test file + ids = [train_vocab[word] if word in train_vocab else train_vocab['-UNK-'] + for word in test_lines[lines_done].split(' ')] + + state = session.run(self.reset_state) + remember_state = state + train_state = session.run(self.reset_state) + in_token = False + correct_token = '' + train_start = 0 + train_end = 0 + + # Reset the LSTM state to zeros and train + for subtoken_id, context_target in enumerate(zip(ids[:-1], ids[1:])): + context, target = context_target + train_end += 1 + + if (subtoken_id - 1) - train_start > train_every and not train_vocab_rev[context].endswith( + '@@'): + # train + tr_context, tr_target, tr_target_weights = ids[train_start: subtoken_id - 1], \ + ids[train_start + 1: subtoken_id], \ + [1.0] * ((subtoken_id - 1) - train_start) + start_train_at = subtoken_id + feed_dict = {self.inputd: np.tile(tr_context, (self.batch_size, 1)), + self.targets: np.tile(tr_target, (self.batch_size, 1)), + self.target_weights: np.tile(tr_target_weights, (self.batch_size, 1)), + self.learning_rate: config.learning_rate, + self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = train_state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = train_state[i].c + feed_dict[h] = train_state[i].h + _, cost, train_state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + train_start = train_end + + feed_dict = {self.inputd: np.array([[context]] * self.batch_size), + self.targets: np.array([[target]] * self.batch_size), + self.target_weights: np.array([[1.0]] * self.batch_size), + self.keep_probability: 1.0 + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = state[i].c + feed_dict[h] = state[i].h + norm_logits, loss, cost, state = session.run( + [self.norm_logits, self.loss, self.cost, self.next_state], + feed_dict) + correct_word = train_vocab_rev[target] + + if correct_word.endswith('@@'): + if not in_token: + correct_subtokens = [] + remember_state = state + logits = norm_logits[0] + correct_token = correct_word[:-2] + else: + correct_token += correct_word[:-2] + correct_subtokens.append(correct_word) + in_token = True + continue + else: + tokens_done += 1 + if not in_token: + correct_subtokens = [] + remember_state = state + logits = norm_logits[0] + correct_token = correct_word + else: + correct_token += correct_word + in_token = False + correct_subtokens.append(correct_word) + + full_tokens_found = 0 + full_tokens = [] + + # Rank single subtoken long predictions and keep top_needed (usually 10) best complete token ones + sorted = list(enumerate(logits)) + sorted.sort(key=itemgetter(1), reverse=True) + complete_done = 0 + prob_mass = 0.0 + counted = 0 + for id, prob in sorted: + counted += 1 + word = train_vocab_rev[id] + if not word.endswith('@@'): + complete_done += 1 + full_tokens.append((prob, word)) + prob_mass += prob + if complete_done >= top_needed: + break + + # Probability mass greater than satisfaction_prob so output this prediction + if prob_mass > satisfaction_prob or counted == top_needed: + rank = 0 + correct_found = False + if verbose: print(correct_token) + for prob, prediction in full_tokens: + if verbose: print(prob, prediction) + if not correct_found: + rank += 1 + if prediction == correct_token: + mrr += 1.0 / rank + correct_found = True + if verbose: print('MRR:', mrr / tokens_done) + 
if verbose: print() + continue + + # Remember the score of the worst one out of the top_needed (usually 10) full_token candidates + worst_full_score = full_tokens[-1][0] + # Create a priority queue to rank predictions and continue the search + heapq.heapify(full_tokens) + # Now find beam_size best candidates to initialize the search + candidates_pq = [] + for id, prob in sorted: + word = train_vocab_rev[id] + if word.endswith('@@'): + # All state vectors are the same so the first is used + candidates_pq.append((-prob, Candidate(remember_state[0][0], id, word[:-2], -prob, ''))) + if len(candidates_pq) >= beam_size: + break + heapq.heapify(candidates_pq) + full_tokens_scored = 0 + + # Keep creating candidates until 5000 have been created or total probability mass has exceeded satisfaction_prob + # Search can stop earlier if the best current candidate has score worst than that + # of the worst one of the initial full_tokens since it would be pointless to further continue the search + search_iterations = 0 + while full_tokens_scored < 5000 and prob_mass <= satisfaction_prob and search_iterations < 8: + search_iterations += 1 + # Create a beam of new candidates until 500 full tokens have been produced + to_expand = [] + new_state = (np.empty([beam_size, config.hidden_size]),) + for c_id in range(beam_size): + if len(candidates_pq) == 0: + break + to_expand.append(heapq.heappop(candidates_pq)) + new_state[0][c_id] = to_expand[-1][1].get_state_vec() + + if len(to_expand) < beam_size: break + if -to_expand[0][1].get_parent_prob() < worst_full_score: + break + + feed_dict = { + self.inputd: np.array([[candidate.get_id()] for (score, candidate) in to_expand]), + self.keep_probability: 1.0 + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = new_state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = new_state[i].c + feed_dict[h] = new_state[i].h + norm_logits, new_state = session.run([self.norm_logits, self.next_state], feed_dict) + for c_id in range(beam_size): + if len(to_expand) <= c_id: break + _, candidate = to_expand[c_id] + logits = norm_logits[c_id] + sorted = list(enumerate(logits)) + sorted.sort(key=itemgetter(1), reverse=True) + for i in range(beam_size): + id, prob = sorted[i] + new_prob = candidate.get_parent_prob() * prob + if not train_vocab_rev[id].endswith('@@'): + full_tokens_scored += 1 + prob_mass += -new_prob + heapq.heappushpop(full_tokens, + (-new_prob, candidate.get_text() + train_vocab_rev[id])) + worst = heapq.nsmallest(1, full_tokens) + worst_full_score = worst[0][0] + else: + word = train_vocab_rev[id][:-2] + heapq.heappush(candidates_pq, + (new_prob, + Candidate(new_state[0][c_id], id, candidate.get_text() + word, + new_prob, ''))) + + # Get top and count rank of correct answer + if verbose: print(correct_token) + full_tokens.sort(reverse=True) + for i, answer in enumerate(full_tokens): + prob, prediction = answer + if verbose: print(-prob, prediction) + if prediction == correct_token: + mrr += 1.0 / (i + 1) + if verbose: print('MRR:', mrr / tokens_done) + if verbose: print() + + pass + # Train on remainder + if len(ids) - start_train_at > 1: + tr_context, tr_target, tr_target_weights = ids[train_start: -1], \ + ids[train_start + 1:], \ + [1.0] * len(ids[train_start: -1]) + feed_dict = {self.inputd: np.tile(tr_context, (self.batch_size, 1)), + self.targets: np.tile(tr_target, (self.batch_size, 1)), + self.target_weights: np.tile(tr_target_weights, (self.batch_size, 1)), + self.learning_rate: config.learning_rate, + 
self.keep_probability: config.keep_prob + } + if FLAGS.gru: + for i, h in enumerate(self.reset_state): + feed_dict[h] = train_state[i] + else: + for i, (c, h) in enumerate(self.reset_state): + feed_dict[c] = train_state[i].c + feed_dict[h] = train_state[i].h + _, cost, train_state, loss, iteration = session.run( + [self.train_step, self.cost, self.next_state, self.loss, self.iteration], feed_dict) + + print(ctr, 'MRR:', mrr / tokens_done) + # sys.stdout.flush() + except (StopTrainingException, KeyboardInterrupt): + print("Finished training ........") + + print("Projects Done:", proj_id + 1) + return mrr / tokens_done + + def write_model_parameters(self, model_directory): + """ Saves basic model information. :param model_directory: :return: """ - parameters = { - "num_layers": str(self.num_layers), - "vocab_size": str(self.vocab_size), - "hidden_size": str(self.hidden_size), - "keep_probability": str(self.keep_probability), - "total_parameters": str(self.get_parameter_count()) - } - with open(self.parameters_file(model_directory), "w") as f: - json.dump(parameters, f, indent=4) - - @staticmethod - def parameters_file(model_directory): - return os.path.join(model_directory, "parameters.json") - - @staticmethod - def model_file(model_directory): - return os.path.join(model_directory, "model") + parameters = { + "num_layers": str(self.num_layers), + "vocab_size": str(self.vocab_size), + "hidden_size": str(self.hidden_size), + "keep_probability": str(self.keep_probability), + "total_parameters": str(self.get_parameter_count()) + } + with open(self.parameters_file(model_directory), "w") as f: + json.dump(parameters, f, indent=4) + + @staticmethod + def parameters_file(model_directory): + return os.path.join(model_directory, "parameters.json") + + @staticmethod + def model_file(model_directory): + return os.path.join(model_directory, "model") + def calculate_predictability(test_lines, train_vocab, train_vocab_rev, config, output_path, model, session): - """ + """ Computes predictability for each test file instance and returs the average. :param test_lines: :param train_vocab: @@ -2029,70 +2131,71 @@ def calculate_predictability(test_lines, train_vocab, train_vocab_rev, config, o :param session: The TF session in which operations should be run. 
:return: """ - config.batch_size = config.test_batch_size - ctr = 0 - predictabilities = [] - for test_line in test_lines: - ctr += 1 - if ctr % 1000 == 0: - print("\t %d lines" % ctr) - - test_line = test_line.replace("\n", (" %s" % "-eod-")) - ids = [train_vocab[word] - if word in train_vocab else train_vocab["-UNK-"] - for word in test_line.split(' ')] - test_dataset = reader.dataset(ids, train_vocab, train_vocab_rev) - file_predictability = model.test(session, config, test_dataset) - predictabilities.append(file_predictability) - average_predictability = sum(predictabilities) / ctr - return average_predictability + config.batch_size = config.test_batch_size + ctr = 0 + predictabilities = [] + for test_line in test_lines: + ctr += 1 + if ctr % 1000 == 0: + print("\t %d lines" % ctr) + + test_line = test_line.replace("\n", (" %s" % "-eod-")) + ids = [train_vocab[word] + if word in train_vocab else train_vocab["-UNK-"] + for word in test_line.split(' ')] + test_dataset = reader.dataset(ids, train_vocab, train_vocab_rev) + file_predictability = model.test(session, config, test_dataset) + predictabilities.append(file_predictability) + average_predictability = sum(predictabilities) / ctr + return average_predictability + def do_test(test_path, train_vocab, train_vocab_rev, config): - test_wids = reader._file_to_word_ids(test_path, train_vocab) - test_dataset = reader.dataset(test_wids, train_vocab, train_vocab_rev) - with tf.Graph().as_default(): - with tf.Session(config=get_gpu_config()) as session: - model = create_model(session, config) - model.train_vocab = train_vocab - test_perplexity = model.test(session, config, test_dataset) - print("\n\nTest perplexity is " + str(test_perplexity) + "\n") + test_wids = reader._file_to_word_ids(test_path, train_vocab) + test_dataset = reader.dataset(test_wids, train_vocab, train_vocab_rev) + with tf.Graph().as_default(): + with tf.compat.v1.Session(config=get_gpu_config()) as session: + model = create_model(session, config) + model.train_vocab = train_vocab + test_perplexity = model.test(session, config, test_dataset) + print("\n\nTest perplexity is " + str(test_perplexity) + "\n") + def create_model(session, config): - """ + """ Creates the NLM and restores its parameters if there is a saved checkpoint. :param session: The TF session in which operations will be run. :param config: The configuration to be used. 
:return: """ - model = NLM(config) - ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): - print("Reading model parameters from %s" % ckpt.model_checkpoint_path) - model.saver.restore(session, ckpt.model_checkpoint_path) - else: - print("Created model with fresh parameters:") - session.run(tf.global_variables_initializer()) - print("*Number of parameters* = " + str(model.get_parameter_count())) - return model + model = NLM(config) + ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) + if ckpt and tf.compat.v1.train.checkpoint_exists(ckpt.model_checkpoint_path): + print("Reading model parameters from %s" % ckpt.model_checkpoint_path) + model.saver.restore(session, ckpt.model_checkpoint_path) + else: + print("Created model with fresh parameters:") + session.run(tf.compat.v1.global_variables_initializer()) + print("*Number of parameters* = " + str(model.get_parameter_count())) + return model class Config(object): - """Configuration""" - - def __init__(self, inits, lr, mgrad, nlayers, nsteps, hsize, mepoch, kp, decay, bsize, tbsize, vsize): - self.init_scale = inits - self.learning_rate = lr - self.max_grad_norm = mgrad - self.num_layers = nlayers - self.num_steps = nsteps - self.hidden_size = hsize - self.max_epoch = mepoch - self.keep_prob = kp - self.lr_decay = decay - self.batch_size = bsize - self.test_batch_size = tbsize - self.vocab_size = vsize - + """Configuration""" + + def __init__(self, inits, lr, mgrad, nlayers, nsteps, hsize, mepoch, kp, decay, bsize, tbsize, vsize): + self.init_scale = inits + self.learning_rate = lr + self.max_grad_norm = mgrad + self.num_layers = nlayers + self.num_steps = nsteps + self.hidden_size = hsize + self.max_epoch = mepoch + self.keep_prob = kp + self.lr_decay = decay + self.batch_size = bsize + self.test_batch_size = tbsize + self.vocab_size = vsize def main(_): @@ -2100,7 +2203,7 @@ def main(_): Handles argument parsing and runs the chosen scenario. """ if not FLAGS.data_path: - raise ValueError("Must set --data_path to directory with train/valid/test") + raise ValueError("Must set --data_path to directory with train/valid/test") config = Config(FLAGS.init_scale, FLAGS.learning_rate, FLAGS.max_grad_norm, FLAGS.num_layers, FLAGS.num_steps, FLAGS.hidden_size, FLAGS.max_epoch, FLAGS.keep_prob, FLAGS.lr_decay, FLAGS.batch_size, @@ -2109,223 +2212,229 @@ def main(_): exit_criteria = ExitCriteria(config.max_epoch) if FLAGS.predict: - # Runs the predictability scenario (per file entropy/perplexity). - vocab_path = FLAGS.train_dir + "/vocab.txt" - train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) - config.vocab_size = len(train_vocab) - start_time = time.time() - - test_lines = [] - valid_file = FLAGS.data_path + '/' + FLAGS.validation_filename - with open(valid_file, 'r') as f: - test_lines = [line for line in f] - # test_lines = ['class Car { public static void main ( String [ ] args) }\n'] - with tf.Graph().as_default(): - with tf.Session(config=get_gpu_config()) as session: - model = create_model(session, config) - model.train_vocab = train_vocab - model.train_vocab_rev = train_vocab_rev - ppl = calculate_predictability(test_lines, train_vocab, train_vocab_rev, config, '', model, session) - print("Average:", ppl) - print("Total time %s" % timedelta(seconds=time.time() - start_time)) - print("Done computing predictability scores!") + # Runs the predictability scenario (per file entropy/perplexity). 
+ vocab_path = FLAGS.train_dir + "/vocab.txt" + train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) + config.vocab_size = len(train_vocab) + start_time = time.time() + + test_lines = [] + valid_file = FLAGS.data_path + '/' + FLAGS.validation_filename + with open(valid_file, 'r') as f: + test_lines = [line for line in f] + # test_lines = ['class Car { public static void main ( String [ ] args) }\n'] + with tf.Graph().as_default(): + with tf.compat.v1.Session(config=get_gpu_config()) as session: + model = create_model(session, config) + model.train_vocab = train_vocab + model.train_vocab_rev = train_vocab_rev + ppl = calculate_predictability(test_lines, train_vocab, train_vocab_rev, config, '', model, session) + print("Average:", ppl) + print("Total time %s" % timedelta(seconds=time.time() - start_time)) + print("Done computing predictability scores!") elif FLAGS.dynamic_test: - # Runs the dynamic adaptation scenario. - vocab_path = FLAGS.train_dir + "/vocab.txt" - train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) - config.vocab_size = len(train_vocab) - start_time = time.time() - - test_lines = [] - test_file = FLAGS.data_path + '/' + FLAGS.test_filename - test_proj_file = FLAGS.data_path + '/' + FLAGS.test_proj_filename - with open(test_file, 'r') as f: - test_lines = [line for line in f] - with open(test_proj_file, 'r') as f: - test_proj_lines = [line for line in f] - # test_lines = ['class Car { public static void main ( String [ ] args) }\n'] - with tf.Graph().as_default(): - with tf.Session(config=get_gpu_config()) as session: - model = create_model(session, config) - model.train_vocab = train_vocab - model.train_vocab_rev = train_vocab_rev - perplexity = model.dynamic_train_test(test_lines, train_vocab, train_vocab_rev, - test_proj_lines, config, '', session) - print('Average perplexity:', perplexity) - print("Total time %s" % timedelta(seconds=time.time() - start_time)) - print("Done computing predictability scores!") + # Runs the dynamic adaptation scenario. + vocab_path = FLAGS.train_dir + "/vocab.txt" + train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) + config.vocab_size = len(train_vocab) + start_time = time.time() + + test_lines = [] + test_file = FLAGS.data_path + '/' + FLAGS.test_filename + test_proj_file = FLAGS.data_path + '/' + FLAGS.test_proj_filename + with open(test_file, 'r') as f: + test_lines = [line for line in f] + with open(test_proj_file, 'r') as f: + test_proj_lines = [line for line in f] + # test_lines = ['class Car { public static void main ( String [ ] args) }\n'] + with tf.Graph().as_default(): + with tf.compat.v1.Session(config=get_gpu_config()) as session: + model = create_model(session, config) + model.train_vocab = train_vocab + model.train_vocab_rev = train_vocab_rev + perplexity = model.dynamic_train_test(test_lines, train_vocab, train_vocab_rev, + test_proj_lines, config, '', session) + print('Average perplexity:', perplexity) + print("Total time %s" % timedelta(seconds=time.time() - start_time)) + print("Done computing predictability scores!") elif FLAGS.maintenance_test: - # Runs the code maintenance scenario and calculates entropy/perplexity. 
- vocab_path = FLAGS.train_dir + "/vocab.txt" - train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) - config.vocab_size = len(train_vocab) - start_time = time.time() - - test_lines = [] - test_file = FLAGS.data_path + '/' + FLAGS.test_filename - test_proj_file = FLAGS.data_path + '/' + FLAGS.test_proj_filename - with open(test_file, 'r') as f: - test_lines = [line.replace("\n", (" %s" % "-eod-")) for line in f] - with open(test_proj_file, 'r') as f: - test_proj_lines = [line.rstrip('\n') for line in f] - # test_lines = ['class Car { public static void main ( String [ ] args) }\n'] - with tf.Graph().as_default(): - with tf.Session(config=get_gpu_config()) as session: - model = create_model(session, config) - perplexity = model.maintenance_test(session, config, test_lines, test_proj_lines, train_vocab, train_vocab_rev) - print('Average perplexity:', perplexity) - print("Total time %s" % timedelta(seconds=time.time() - start_time)) - print("Done computing predictability scores!") + # Runs the code maintenance scenario and calculates entropy/perplexity. + vocab_path = FLAGS.train_dir + "/vocab.txt" + train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) + config.vocab_size = len(train_vocab) + start_time = time.time() + + test_lines = [] + test_file = FLAGS.data_path + '/' + FLAGS.test_filename + test_proj_file = FLAGS.data_path + '/' + FLAGS.test_proj_filename + with open(test_file, 'r') as f: + test_lines = [line.replace("\n", (" %s" % "-eod-")) for line in f] + with open(test_proj_file, 'r') as f: + test_proj_lines = [line.rstrip('\n') for line in f] + # test_lines = ['class Car { public static void main ( String [ ] args) }\n'] + with tf.Graph().as_default(): + with tf.compat.v1.Session(config=get_gpu_config()) as session: + model = create_model(session, config) + perplexity = model.maintenance_test(session, config, test_lines, test_proj_lines, train_vocab, + train_vocab_rev) + print('Average perplexity:', perplexity) + print("Total time %s" % timedelta(seconds=time.time() - start_time)) + print("Done computing predictability scores!") elif FLAGS.test: - # Default test scenario. Essentially entropy/perplexity calculation. - vocab_path = FLAGS.train_dir + "/vocab.txt" - train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) - print(len(train_vocab)) - config.vocab_size = len(train_vocab) - start_time = time.time() - do_test(FLAGS.data_path + "/" + FLAGS.test_filename, train_vocab, train_vocab_rev, config) - print("Total time %s" % timedelta(seconds=time.time() - start_time)) - print("Done testing!") + # Default test scenario. Essentially entropy/perplexity calculation. + vocab_path = FLAGS.train_dir + "/vocab.txt" + train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) + print(len(train_vocab)) + config.vocab_size = len(train_vocab) + start_time = time.time() + do_test(FLAGS.data_path + "/" + FLAGS.test_filename, train_vocab, train_vocab_rev, config) + print("Total time %s" % timedelta(seconds=time.time() - start_time)) + print("Done testing!") elif FLAGS.completion: - # Runs the code completion scenario and calculates MRR if dynamic adaptation is on it also adapts the model. 
- vocab_path = FLAGS.train_dir + "/vocab.txt" - train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) - config.vocab_size = len(train_vocab) - start_time = time.time() - test_wids = reader._file_to_word_ids(FLAGS.data_path + "/" + FLAGS.test_filename, train_vocab) - test_dataset = reader.dataset(test_wids, train_vocab, train_vocab_rev) - if FLAGS.dynamic: + # Runs the code completion scenario and calculates MRR if dynamic adaptation is on it also adapts the model. + t_dir = FLAGS.train_dir if FLAGS.train_dir else "." + vocab_path = t_dir + "/vocab.txt" + train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) + config.vocab_size = len(train_vocab) + start_time = time.time() + test_wids = reader._file_to_word_ids(FLAGS.data_path + "/" + FLAGS.test_filename, train_vocab) + test_dataset = reader.dataset(test_wids, train_vocab, train_vocab_rev) + if FLAGS.dynamic: + test_proj_file = FLAGS.data_path + '/' + FLAGS.test_proj_filename + with open(test_proj_file, 'r') as f: + test_proj_lines = [line for line in f] + else: + test_proj_lines = [] + with tf.Graph().as_default(): + with tf.compat.v1.Session(config=get_gpu_config()) as session: + model = create_model(session, config) + model.train_vocab = train_vocab + model.train_vocab_rev = train_vocab_rev + + id_map = None + if FLAGS.identifier_map: + id_map = [] + with open(FLAGS.identifier_map, 'r') as f: + for line in f: + id_map.append(ast.literal_eval(line.rstrip('\n'))) + + token_map = None + if FLAGS.subtoken_map: + token_map = [] + with open(FLAGS.subtoken_map, 'r') as f: + for line in f: + token_map.append(ast.literal_eval(line.rstrip('\n'))) + + mrr = model.completion(session, config, test_dataset, test_proj_lines, config.batch_size, \ + FLAGS.dynamic, id_map, FLAGS.cache_ids, token_map) + print(mrr) + print("Total time %s" % timedelta(seconds=time.time() - start_time)) + print("Done completion!") + elif FLAGS.maintenance_completion: + # Runs the maintenance code completion scenario and calculates MRR. + vocab_path = FLAGS.train_dir + "/vocab.txt" + train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) + config.vocab_size = len(train_vocab) + start_time = time.time() + test_lines = [] + test_file = FLAGS.data_path + '/' + FLAGS.test_filename test_proj_file = FLAGS.data_path + '/' + FLAGS.test_proj_filename + with open(test_file, 'r') as f: + test_lines = [line.replace("\n", (" %s" % "-eod-")) for line in f] with open(test_proj_file, 'r') as f: - test_proj_lines = [line for line in f] - else: - test_proj_lines = [] - with tf.Graph().as_default(): - with tf.Session(config=get_gpu_config()) as session: - model = create_model(session, config) - model.train_vocab = train_vocab - model.train_vocab_rev = train_vocab_rev - - id_map = None - if FLAGS.identifier_map: - id_map = [] - with open(FLAGS.identifier_map, 'r') as f: - for line in f: - id_map.append(ast.literal_eval(line.rstrip('\n'))) - - token_map = None - if FLAGS.subtoken_map: - token_map = [] - with open(FLAGS.subtoken_map, 'r') as f: - for line in f: - token_map.append(ast.literal_eval(line.rstrip('\n'))) - - mrr = model.completion(session, config, test_dataset, test_proj_lines, config.batch_size, \ - FLAGS.dynamic, id_map, FLAGS.cache_ids, token_map) - print(mrr) - print("Total time %s" % timedelta(seconds=time.time() - start_time)) - print("Done completion!") - elif FLAGS.maintenance_completion: - # Runs the maintenance code completion scenario and calculates MRR. 
- vocab_path = FLAGS.train_dir + "/vocab.txt" - train_vocab, train_vocab_rev = reader._read_vocab(vocab_path) - config.vocab_size = len(train_vocab) - start_time = time.time() - test_lines = [] - test_file = FLAGS.data_path + '/' + FLAGS.test_filename - test_proj_file = FLAGS.data_path + '/' + FLAGS.test_proj_filename - with open(test_file, 'r') as f: - test_lines = [line.replace("\n", (" %s" % "-eod-")) for line in f] - with open(test_proj_file, 'r') as f: - test_proj_lines = [line.rstrip('\n') for line in f] - - with tf.Graph().as_default(): - with tf.Session(config=get_gpu_config()) as session: - model = create_model(session, config) - mrr = model.maintenance_completion(session, config, test_lines, test_proj_lines, train_vocab, - train_vocab_rev, config.batch_size) - print('Final MRR:', mrr) - print("Total time %s" % timedelta(seconds=time.time() - start_time)) - print("Done computing mean reciprocal rank!") + test_proj_lines = [line.rstrip('\n') for line in f] + + with tf.Graph().as_default(): + with tf.compat.v1.Session(config=get_gpu_config()) as session: + model = create_model(session, config) + mrr = model.maintenance_completion(session, config, test_lines, test_proj_lines, train_vocab, + train_vocab_rev, config.batch_size) + print('Final MRR:', mrr) + print("Total time %s" % timedelta(seconds=time.time() - start_time)) + print("Done computing mean reciprocal rank!") else: - # Default scenario. Trains on training set and calculates entropy/perplexity for each epoch on the validation set. - train_file = FLAGS.data_path + '/' + FLAGS.train_filename #"/java_10M_train_bpe" - valid_file = FLAGS.data_path + '/' + FLAGS.validation_filename #"/java_validation_10%_sample_bpe" - train_vocab, train_vocab_rev = reader._build_vocab(train_file, FLAGS.thresh) - print("Vocabulary size:", len(train_vocab)) - config.vocab_size = len(train_vocab) # change so that vocab also reflects UNK, EMPTY, EOS etc - reader._write_vocab(train_vocab, FLAGS.train_dir + "/vocab.txt") - - train_wids = reader._file_to_word_ids(train_file, train_vocab) - train_dataset = reader.dataset(train_wids, train_vocab, train_vocab_rev) - val_wids = reader._file_to_word_ids(valid_file, train_vocab) - valid_dataset = reader.dataset(val_wids, train_vocab, train_vocab_rev) - del train_wids - del val_wids - - start_time = time.time() - with tf.Graph().as_default(): - with tf.Session(config=get_gpu_config()) as session: - md = create_model(session, config) - md.train_vocab = train_vocab - md.train_vocab_rev = train_vocab_rev - md.write_model_parameters(FLAGS.train_dir) - md.train(session, config, train_dataset, exit_criteria, valid_dataset, FLAGS.train_dir) - print("Total time %s" % timedelta(seconds=time.time() - start_time)) - print("Done training!") + # Default scenario. Trains on training set and calculates entropy/perplexity for each epoch on the validation set. 
+ train_file = FLAGS.data_path + '/' + FLAGS.train_filename # "/java_10M_train_bpe" + valid_file = FLAGS.data_path + '/' + FLAGS.validation_filename # "/java_validation_10%_sample_bpe" + train_vocab, train_vocab_rev = reader._build_vocab(train_file, FLAGS.thresh) + print("Vocabulary size:", len(train_vocab)) + config.vocab_size = len(train_vocab) # change so that vocab also reflects UNK, EMPTY, EOS etc + reader._write_vocab(train_vocab, FLAGS.train_dir + "/vocab.txt") + + train_wids = reader._file_to_word_ids(train_file, train_vocab) + train_dataset = reader.dataset(train_wids, train_vocab, train_vocab_rev) + val_wids = reader._file_to_word_ids(valid_file, train_vocab) + valid_dataset = reader.dataset(val_wids, train_vocab, train_vocab_rev) + del train_wids + del val_wids + + start_time = time.time() + with tf.Graph().as_default(): + with tf.compat.v1.Session(config=get_gpu_config()) as session: + md = create_model(session, config) + md.train_vocab = train_vocab + md.train_vocab_rev = train_vocab_rev + md.write_model_parameters(FLAGS.train_dir) + md.train(session, config, train_dataset, exit_criteria, valid_dataset, FLAGS.train_dir) + print("Total time %s" % timedelta(seconds=time.time() - start_time)) + print("Done training!") + class StopTrainingException(Exception): - pass + pass class ExitCriteria(object): - """ + """ Defines the criteria needed for training termination. """ - def __init__(self, max_epochs): - self.max_epochs = max_epochs + + def __init__(self, max_epochs): + self.max_epochs = max_epochs + class Candidate(object): - """ + """ Represents a code completion search candidate. """ - def __init__(self, state_vec, id, token_text, parent_prob, subtoken_history): - self._state_vec = state_vec - self._id = id - self._token_text = token_text - self._parent_prob = parent_prob - self._subtoken_history = subtoken_history - def get_state_vec(self): - return self._state_vec + def __init__(self, state_vec, id, token_text, parent_prob, subtoken_history): + self._state_vec = state_vec + self._id = id + self._token_text = token_text + self._parent_prob = parent_prob + self._subtoken_history = subtoken_history + + def get_state_vec(self): + return self._state_vec + + def get_id(self): + return self._id + + def get_text(self): + return self._token_text - def get_id(self): - return self._id + def get_parent_prob(self): + return self._parent_prob - def get_text(self): - return self._token_text + def get_subtoken_history(self): + return tuple(self._subtoken_history) - def get_parent_prob(self): - return self._parent_prob + def __eq__(self, other): + return self._token_text == other._token_text - def get_subtoken_history(self): - return tuple(self._subtoken_history) - - def __eq__(self, other): - return self._token_text == other._token_text - - def __lt__(self, other): - return self._token_text < other._token_text + def __lt__(self, other): + return self._token_text < other._token_text - def __gt__(self, other): - return self._token_text > other._token_text + def __gt__(self, other): + return self._token_text > other._token_text - def __le__(self, other): - return self._token_text <= other._token_text + def __le__(self, other): + return self._token_text <= other._token_text - def __ge__(self, other): - return self._token_text >= other._token_text + def __ge__(self, other): + return self._token_text >= other._token_text -if __name__=="__main__": - tf.app.run() +if __name__ == "__main__": + tf.compat.v1.app.run() diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 
0000000..479f83e --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,36 @@ +version: '3.8' + +services: + experiment: + image: ciselab/openvocabcodenlm + oom_kill_disable: true + volumes: + - ./sample_data/java/:/data/java/ + - ./sample_data/java/model:/models + build: + context: . + dockerfile: Dockerfile + environment: + DO_TRAIN: true + DO_TEST: true + DO_COMPLETION: false + VERBOSE: true + + EPOCHS: 2 + + BATCH_SIZE: 16 + + TRAIN_FILE: java_training_slp_pre_enc_bpe_10000 + VALIDATION_FILE: java_validation_slp_pre_enc_bpe_10000 + TEST_FILE: java_test_slp_pre_enc_bpe_10000 + TEST_PROJ_NAMES_FILE: testProjects + ID_MAP_FILE: /data/java/id_map_java_test_slp_pre_bpe_10000 + # For the dirs, be careful not to add a trailing "/"; it will confuse the Python scripts + DATA_HOME: /data/java + MODEL_DIR: /models + + deploy: + resources: + reservations: + devices: + - capabilities: [gpu] \ No newline at end of file diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 0000000..212f90a --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Entrypoint for the OpenVocabCodeNLM Experiment + +# This file invokes the original Python code of OpenVocabCodeNLM with the environment variables set in the Docker container. +# Additionally, it checks which of the flags for training, testing and completion have been set. + + +# Run mkdir on $MODEL_DIR, in case it does not exist yet (needed if you start with training) +mkdir $MODEL_DIR + + +# Training the model +if [ "$DO_TRAIN" = true ]; then + if [ "$VERBOSE" = true ]; then + python code_nlm.py --data_path $DATA_HOME --train_dir $MODEL_DIR --train_filename $TRAIN_FILE --validation_filename $VALIDATION_FILE --gru True --hidden_size $STATE_DIMS --batch_size $BATCH_SIZE --word_level_perplexity True --cross_entropy True --steps_per_checkpoint $CHECKPOINT_EVERY --max_epoch $EPOCHS --verbose True + else + python code_nlm.py --data_path $DATA_HOME --train_dir $MODEL_DIR --train_filename $TRAIN_FILE --validation_filename $VALIDATION_FILE --gru True --hidden_size $STATE_DIMS --batch_size $BATCH_SIZE --word_level_perplexity True --cross_entropy True --steps_per_checkpoint $CHECKPOINT_EVERY --max_epoch $EPOCHS + fi +fi + +# Testing the model (Calculating test set entropy) +if [ "$DO_TEST" = true ]; then + if [ "$VERBOSE" = true ]; then + python code_nlm.py --test True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --word_level_perplexity True --cross_entropy True --verbose True + else + python code_nlm.py --test True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --word_level_perplexity True --cross_entropy True + fi +fi + +# Code completion +if [ "$DO_COMPLETION" = true ]; then + if [ "$VERBOSE" = true ]; then + python code_nlm.py --completion True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --verbose True + else + python code_nlm.py --completion True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE + fi +fi + +# Add this to keep the container open (e.g. for debugging or inspection) +#echo "Entrypoint finished - keeping container artificially open ..."
+#tail -f /dev/null \ No newline at end of file diff --git a/example.sh b/example.sh index 18dc64a..9bda335 100755 --- a/example.sh +++ b/example.sh @@ -14,7 +14,7 @@ TEST_PROJ_NAMES_FILE=testProjects ID_MAP_FILE=sample_data/java/id_map_java_test_slp_pre_bpe_10000 # Maximum training epochs -EPOCHS=5 +EPOCHS=2 # Initial learning rate LR=0.1 # This is the default value. You can skip it if you don't want to change it. # Training batch size diff --git a/non-ascii_sequences_to_unk.py b/non-ascii_sequences_to_unk.py index 193ece4..e69de29 100644 --- a/non-ascii_sequences_to_unk.py +++ b/non-ascii_sequences_to_unk.py @@ -1,26 +0,0 @@ -import sys - -UNKNOWN_WORD = "-UNK-" - -def non_ascii_seq_to_unk(source_file, destination_file): - with open(source_file, 'r') as rf: - with open(destination_file, 'w') as wf: - for line in rf: - in_non_ascii_seq = False - for char in line: - if ord(char) < 128: - if in_non_ascii_seq: - wf.write(UNKNOWN_WORD) - in_non_ascii_seq = False - wf.write(char) - else: - in_non_ascii_seq = True - - - - -if __name__=="__main__": - if len(sys.argv) != 3: - print 'Usage non-ascii_sequences_to_unk.py source_file destination_file' - sys.exit(1) - non_ascii_seq_to_unk( sys.argv[1], sys.argv[2] ) diff --git a/original_README.md b/original_README.md new file mode 100644 index 0000000..889b82e --- /dev/null +++ b/original_README.md @@ -0,0 +1,170 @@ +# OpenVocabNLMs +Contains the code for our ICSE 2020 submission: open vocabulary language model for source code that uses the byte pair encoding algorithm to learn a segmentation of code tokens into subtokens. + +If you use our code/implementation, datasets or pre-trained models please cite our paper: +@inproceedings{Karampatsis2020ICSE,\ +author = {Karampatsis, Rafael - Michael and Babii, Hlib and Robbes, Romain and Sutton, Charles and Janes, Andrea},\ +title = {{Big Code != Big Vocabulary: Open-Vocabulary Models for Source code}},\ +year = {2020},\ +publisher = {ACM},\ +url = {https://doi.org/10.1145/3377811.3380342}, \ +doi = {10.1145/3377811.3380342},\ +booktitle = {Proceedings of the 42nd International Conference on Software Engineering},\ +pages = {},\ +numpages = {11},\ +location = {Seoul, South Korea},\ +series = {ICSE ’20}\ +} + + +# Code Structure +**non-ascii_sequences_to_unk.py** is a preprocessing script that can be used to remove non-ascii sequences from the data and replace them with a special symbol. + +**create_subtoken_data.py** is also a preprocessing script that can be used to subtokenize data based on the heuristic of [Allamanis et al. (2015)](https://miltos.allamanis.com/publications/2015suggesting/). + +**reader.py** contains utility functions for reading data and providing batches for training and testing of models. + +**code_nlm.py** contains the implementation of our NLM for code and supports training, perplexity/cross-entropy calculation, code-completion simulation as well as dynamic versions of the test scenarios. The updated implementation has also some new features, previously not present in the code. That is measuring identifier specific performance for code completion. Another new feature implements a simple n-gram cache for identifiers that better simulates use of the model in an IDE where such information would be present. In order to use the identifier features a file containing identifier information must be provided through the options. + +# Installation + +Python>2.7.6 or Python==3.6 is required! +Python>3.6 is not supported due to the tensorflow version not supporting it. 
+ +```shell script +git clone https://github.com/mast-group/OpenVocabCodeNLM +cd OpenVocabCodeNLM +pip install -r requirements.txt #python2 +pip3 install -r requirements.txt #python3 +``` +The experiments in the paper were performed using Python 2.7.14, but we have so far not encountered any unresolved issues with Python 3.
+In case you encounter any issues, please open a new issue entry. + + +# Usage Instructions +If you want to try the implementation, unzip the directory containing the sample data. +The sample data contain the small training, validation, and test sets used in the paper with a BPE encoding size of 10000. + +## Option Constants +Let's first define constants for pointing to the data and network parameters. You'll need to modify these to point to your own data and to set the hyperparameters that you want to use. +``` +# Directory that contains train/validation/test data etc. +DATA_HOME=sample_data/java/ +# Directory in which the model will be saved. +MODEL_DIR=sample_data/java/model +mkdir $MODEL_DIR + +# Filenames +TRAIN_FILE=java_training_slp_pre_enc_bpe_10000 +VALIDATION_FILE=java_validation_slp_pre_enc_bpe_10000 +TEST_FILE=java_test_slp_pre_enc_bpe_10000 +TEST_PROJ_NAMES_FILE=testProjects +ID_MAP_FILE=sample_data/java/id_map_java_test_slp_pre_bpe_10000 + +# Maximum training epochs +EPOCHS=2 # Normally this would be larger. For instance 30-50 +# Initial learning rate +LR=0.1 # This is the default value. You can skip it if you don't want to change it. +# Training batch size +BATCH_SIZE=32 # This is also the default. +# RNN unroll timesteps for gradient calculation. +STEPS=20 # 20-50 is a good range of values for dynamic experiments. +# 1 - Dropout probability +KEEP_PROB=0.5 # This is also the default. +# RNN hidden state size +STATE_DIMS=512 # This is also the default. +# Checkpoint and validation loss calculation frequency. +CHECKPOINT_EVERY=5000 # This is also the default. + + +# Understanding boolean options. +# Most boolean options are set to False by default. +# To use any boolean option, set it to True. +# For instance, to use a GRU instead of an LSTM, add the option --gru True to your command. +``` + + +We next present the various scenarios supported by our implementation. + +## Training +The training scenario creates a global model by training on the training data provided to it. +We will train a Java model with a BPE encoding of 10000 using the sample data. +In the following training example we set some of the hyperparameters (to their default values, though). +Optionally, you can set all of them to your intended values. +Since the data is tokenized into subwords, we need to let the script know so that it can calculate the metrics correctly. +For this reason we need to set the *word_level_perplexity* flag to **True**. +In order to also output validation cross-entropy instead of perplexity, we set the *cross_entropy* option to **True**. + +``` +# Train a small Java model for a few epochs.
+python code_nlm.py --data_path $DATA_HOME --train_dir $MODEL_DIR --train_filename $TRAIN_FILE --validation_filename $VALIDATION_FILE --gru True --hidden_size $STATE_DIMS --batch_size $BATCH_SIZE --word_level_perplexity True --cross_entropy True --steps_per_checkpoint $CHECKPOINT_EVERY --max_epoch $EPOCHS + +# Because we are using the default values we could shorten the above command to: +# python code_nlm.py --data_path $DATA_HOME --train_dir $MODEL_DIR --train_filename $TRAIN_FILE --validation_filename $VALIDATION_FILE --gru True --word_level_perplexity True --cross_entropy True --max_epoch $EPOCHS +``` + +## Test Scenarios +### Test Entropy Calculation +``` +# Testing the model (Calculating test set entropy) +python code_nlm.py --test True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --word_level_perplexity True --cross_entropy True +``` + +#### Dynamically Adapt the Model on Test Data +In order to dynamically adapt the model, the implementation needs to know when it is testing on a new project, so that it can revert the model back to the global one. +This is achieved via the *test_proj_filename* option. +``` +# Batch size must always be set to 1 for this scenario! We are going through every file separately. +# In an IDE this could instead be sped up through some engineering. +python code_nlm.py --dynamic_test True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size 1 --word_level_perplexity True --cross_entropy True --test_proj_filename $TEST_PROJ_NAMES_FILE --num_steps $STEPS +``` + +### Test Code Completion +In this scenario the *batch_size* option is used to set the beam size. +``` +python code_nlm.py --completion True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE +``` + +#### Dynamic Code Completion on Test Data +As before, we need to set the *test_proj_filename* option. +``` +python code_nlm.py --completion True --dynamic True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --test_proj_filename $TEST_PROJ_NAMES_FILE --num_steps $STEPS +``` + +#### Dynamic Code Completion on Test Data and Measuring Identifier Specific Performance +To run this experiment, you need to provide a file containing a mapping that lets the implementation know for each subtoken whether it is part of an identifier or not. +This information would easily be present in an IDE. +The mapping is provided via the *identifier_map* option. +``` +python code_nlm.py --completion True --dynamic True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --test_proj_filename $TEST_PROJ_NAMES_FILE --identifier_map $ID_MAP_FILE --num_steps $STEPS +``` + +#### Adding a Simple Identifier n-gram Cache +In an IDE setting we could improve the performance on identifiers by utilizing a simple n-gram cache for identifiers that we have already encountered. +The *file_cache_weight* and *cache_order* options can be used to control the cache's weight and its order, respectively. +By default we use a 6-gram with a weight of 0.2.
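For intuition, the sketch below shows one way such a cache could be blended with the neural model's prediction: a simple linear interpolation between the NLM probability and an identifier n-gram cache probability, using the default weight of 0.2 and order 6 mentioned above. It is only an illustrative sketch under that interpolation assumption, not the repository's actual implementation, and every name in it is hypothetical.

```python
from collections import defaultdict

# Illustrative defaults matching the file_cache_weight and cache_order options described above.
CACHE_WEIGHT = 0.2
CACHE_ORDER = 6

def blended_probability(nlm_prob, history, candidate, cache_counts):
    """Illustrative blend of an NLM probability with an identifier n-gram cache.

    cache_counts maps (context, token) pairs to how often `token` followed
    `context` among identifiers already seen in the current file; `history`
    is the list of subtokens preceding the prediction point.
    """
    context = tuple(history[-(CACHE_ORDER - 1):])
    total = sum(count for (ctx, _), count in cache_counts.items() if ctx == context)
    if total == 0:
        # No cache evidence for this context: use the NLM probability as-is.
        return nlm_prob
    cache_prob = cache_counts.get((context, candidate), 0) / total
    # Linear interpolation between the neural model and the file-local cache.
    return (1.0 - CACHE_WEIGHT) * nlm_prob + CACHE_WEIGHT * cache_prob

# Toy usage: the cache has seen "Builder" follow "get Name" once in this file.
cache = defaultdict(int)
cache[(("get", "Name"), "Builder")] += 1
print(blended_probability(0.05, ["get", "Name"], "Builder", cache))  # prints 0.24
```

The actual command for this scenario follows.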
+``` +python code_nlm.py --completion True --dynamic True --data_path $DATA_HOME --train_dir $MODEL_DIR --test_filename $TEST_FILE --gru True --batch_size $BATCH_SIZE --test_proj_filename $TEST_PROJ_NAMES_FILE --identifier_map $ID_MAP_FILE --cache_ids True --num_steps $STEPS +``` + +### Predictability +Similar to testing but calculates the average entropy of the files instead of the per token one. + + + +# Preprocessing + +## BPE +The BPE implementation used can be found here: https://github.com/rsennrich/subword-nmt + +To apply byte pair encoding to word segmentation, invoke these commands: +``` +subword-nmt learn-bpe -s {num_operations} < {train_file} > {codes_file} +subword-nmt apply-bpe -c {codes_file} < {test_file} > {out_file} +``` +num_operations = The number of BPE ops e.g., 10000
+train_file = The file on which to learn the encoding
+codes_file = The file in which to output the learned encoding
+test_file = The file to segment with the learned encoding
+out_file = The file in which to save the now segmented test_file
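For concreteness, the following Python sketch drives the same two subword-nmt commands through subprocess. The file names are hypothetical placeholders for a raw (not yet encoded) corpus and its outputs, not files shipped with this repository.

```python
import subprocess

# Hypothetical file names, purely for illustration.
train_file = "java_training_pre"          # raw training tokens (one source file per line)
codes_file = "java_bpe_10000.codes"       # learned merge operations
test_file = "java_test_pre"               # raw test tokens
out_file = "java_test_pre_enc_bpe_10000"  # segmented test output

# Learn 10000 BPE merge operations from the training corpus.
with open(train_file) as fin, open(codes_file, "w") as fout:
    subprocess.run(["subword-nmt", "learn-bpe", "-s", "10000"],
                   stdin=fin, stdout=fout, check=True)

# Apply the learned codes to segment the test corpus.
with open(test_file) as fin, open(out_file, "w") as fout:
    subprocess.run(["subword-nmt", "apply-bpe", "-c", codes_file],
                   stdin=fin, stdout=fout, check=True)
```

In the segmented output, subword-nmt marks non-final subwords with an "@@" suffix by default; that segmented text is what the model is then trained and evaluated on.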
+ diff --git a/reader.py b/reader.py index 792ec76..e27bdf1 100644 --- a/reader.py +++ b/reader.py @@ -77,7 +77,7 @@ def _read_words(filename): :return: The whitespace tokenized version of the specified file. """ with tf.device('/cpu:0'): - with tf.gfile.GFile(filename, "r") as f: + with tf.io.gfile.GFile(filename, "r") as f: return f.read().decode("utf-8").strip().split() @@ -89,7 +89,7 @@ def _read_lines(filename): :return: A list of the specified file's lines. """ with tf.device('/cpu:0'): - with tf.gfile.GFile(filename, "r") as f: + with tf.io.gfile.GFile(filename, "r") as f: ret = [] for l in f: ret.append(l.decode("utf8").strip()) @@ -106,7 +106,7 @@ def _read_vocab(filename): with tf.device('/cpu:0'): word_to_id = {} id_to_word = {} - with tf.gfile.GFile(filename, "r") as ff: + with tf.io.gfile.GFile(filename, "r") as ff: for line in ff: word, iden = line.strip().split('\t') iden = int(iden) @@ -122,7 +122,7 @@ def _write_vocab(vocab, filename): :param filename: Path to the file in which the vocabulary will be saved. """ with tf.device('/cpu:0'): - with tf.gfile.GFile(filename, "w") as ff: + with tf.io.gfile.GFile(filename, "w") as ff: for w, wid in vocab.items(): ff.write(w + "\t" + str(wid) + "\n") diff --git a/reduced_requirements.txt b/reduced_requirements.txt new file mode 100644 index 0000000..15262ff --- /dev/null +++ b/reduced_requirements.txt @@ -0,0 +1,16 @@ +grpcio==1.43.0 +h5py==3.1.0 +keras==2.6.0 +Keras-Preprocessing==1.1.2 +numpy==1.19.2 +pygtrie==2.4.2 +scipy==1.7.3 +tensorboard==2.6.0 +tensorboard-data-server==0.6.0 +tensorboard-plugin-wit==1.8.1 +tensorflow==2.6.0 +tensorflow-addons==0.15.0 +tensorflow-estimator==2.6.0 +typeguard==2.13.3 +typing_extensions==3.7.4 +Werkzeug==2.0.1 diff --git a/report.txt b/report.txt new file mode 100644 index 0000000..80173a9 --- /dev/null +++ b/report.txt @@ -0,0 +1,173 @@ +TensorFlow 2.0 Upgrade Script +----------------------------- +Converted 5 files +Detected 12 issues that require attention +-------------------------------------------------------------------------------- +-------------------------------------------------------------------------------- +File: OpenVocabCodeNLM\code_nlm.py +-------------------------------------------------------------------------------- +OpenVocabCodeNLM\code_nlm.py:35:8: ERROR: Using member tf.flags in deprecated module tf.flags. tf.flags and tf.app.flags have been removed, please use the argparse or absl modules if you need command line parsing. +OpenVocabCodeNLM\code_nlm.py:137:32: WARNING: tf.nn.embedding_lookup requires manual check. `partition_strategy` has been removed from tf.nn.embedding_lookup. The 'div' strategy will be used by default. +OpenVocabCodeNLM\code_nlm.py:154:17: WARNING: Using member tf.contrib.rnn.DropoutWrapper in deprecated module tf.contrib.rnn. (Manual edit required) tf.contrib.rnn.* has been deprecated, and widely used cells/functions will be moved to tensorflow/addons repository. Please check it there and file Github issues if necessary. +OpenVocabCodeNLM\code_nlm.py:154:17: ERROR: Using member tf.contrib.rnn.DropoutWrapper in deprecated module tf.contrib. tf.contrib.rnn.DropoutWrapper cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. +OpenVocabCodeNLM\code_nlm.py:156:17: WARNING: Using member tf.contrib.rnn.DropoutWrapper in deprecated module tf.contrib.rnn. 
(Manual edit required) tf.contrib.rnn.* has been deprecated, and widely used cells/functions will be moved to tensorflow/addons repository. Please check it there and file Github issues if necessary. +OpenVocabCodeNLM\code_nlm.py:156:17: ERROR: Using member tf.contrib.rnn.DropoutWrapper in deprecated module tf.contrib. tf.contrib.rnn.DropoutWrapper cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. +OpenVocabCodeNLM\code_nlm.py:168:23: WARNING: tf.get_variable requires manual check. tf.get_variable returns ResourceVariables by default in 2.0, which have well-defined semantics and are stricter about shapes. You can disable this behavior by passing use_resource=False, or by calling tf.compat.v1.disable_resource_variables(). +OpenVocabCodeNLM\code_nlm.py:169:23: WARNING: tf.get_variable requires manual check. tf.get_variable returns ResourceVariables by default in 2.0, which have well-defined semantics and are stricter about shapes. You can disable this behavior by passing use_resource=False, or by calling tf.compat.v1.disable_resource_variables(). +OpenVocabCodeNLM\code_nlm.py:171:18: ERROR: Using member tf.contrib.legacy_seq2seq.sequence_loss_by_example in deprecated module tf.contrib. tf.contrib.legacy_seq2seq.sequence_loss_by_example cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. +OpenVocabCodeNLM\code_nlm.py:315:8: WARNING: *.save requires manual check. (This warning is only applicable if the code saves a tf.Keras model) Keras model.save now saves to the Tensorflow SavedModel format by default, instead of HDF5. To continue saving to HDF5, add the argument save_format='h5' to the save() function. +OpenVocabCodeNLM\code_nlm.py:736:12: WARNING: *.save requires manual check. (This warning is only applicable if the code saves a tf.Keras model) Keras model.save now saves to the Tensorflow SavedModel format by default, instead of HDF5. To continue saving to HDF5, add the argument save_format='h5' to the save() function. +OpenVocabCodeNLM\code_nlm.py:1701:12: WARNING: *.save requires manual check. (This warning is only applicable if the code saves a tf.Keras model) Keras model.save now saves to the Tensorflow SavedModel format by default, instead of HDF5. To continue saving to HDF5, add the argument save_format='h5' to the save() function. +================================================================================ +Detailed log follows: + +================================================================================ +================================================================================ +Input tree: 'OpenVocabCodeNLM' +================================================================================ +-------------------------------------------------------------------------------- +Processing file 'OpenVocabCodeNLM\\code_nlm.py' + outputting to 'OpenVocabCodeNLM_V2\\code_nlm.py' +-------------------------------------------------------------------------------- + +35:8: ERROR: Using member tf.flags in deprecated module tf.flags. tf.flags and tf.app.flags have been removed, please use the argparse or absl modules if you need command line parsing. 
+102:12: INFO: Renamed 'tf.ConfigProto' to 'tf.compat.v1.ConfigProto' +123:9: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +123:9: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +125:27: INFO: Renamed 'tf.placeholder' to 'tf.compat.v1.placeholder' +126:30: INFO: Renamed 'tf.placeholder' to 'tf.compat.v1.placeholder' +128:9: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +128:9: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +129:20: INFO: Renamed 'tf.placeholder' to 'tf.compat.v1.placeholder' +130:21: INFO: Renamed 'tf.placeholder' to 'tf.compat.v1.placeholder' +131:28: INFO: Renamed 'tf.placeholder' to 'tf.compat.v1.placeholder' +134:11: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +134:11: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +136:37: INFO: Renamed 'tf.random_uniform' to 'tf.random.uniform' +137:32: INFO: Added keywords to args of function 'tf.nn.embedding_lookup' +137:32: WARNING: tf.nn.embedding_lookup requires manual check. `partition_strategy` has been removed from tf.nn.embedding_lookup. The 'div' strategy will be used by default. +138:32: INFO: Changing keep_prob arg of tf.nn.dropout to rate, and recomputing value. + +140:9: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +140:9: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +143:41: INFO: Renamed 'tf.contrib.rnn.BasicLSTMCell' to 'tf.compat.v1.nn.rnn_cell.BasicLSTMCell' +144:17: INFO: Renamed 'tf.contrib.rnn.BasicLSTMCell' to 'tf.compat.v1.nn.rnn_cell.BasicLSTMCell' +144:103: INFO: Renamed 'tf.get_variable_scope' to 'tf.compat.v1.get_variable_scope' +146:17: INFO: Renamed 'tf.contrib.rnn.BasicLSTMCell' to 'tf.compat.v1.nn.rnn_cell.BasicLSTMCell' +148:41: INFO: Renamed 'tf.contrib.rnn.GRUCell' to 'tf.compat.v1.nn.rnn_cell.GRUCell' +149:17: INFO: Renamed 'tf.contrib.rnn.GRUCell' to 'tf.compat.v1.nn.rnn_cell.GRUCell' +149:59: INFO: Renamed 'tf.get_variable_scope' to 'tf.compat.v1.get_variable_scope' +151:17: INFO: Renamed 'tf.contrib.rnn.GRUCell' to 'tf.compat.v1.nn.rnn_cell.GRUCell' +154:17: WARNING: Using member tf.contrib.rnn.DropoutWrapper in deprecated module tf.contrib.rnn. (Manual edit required) tf.contrib.rnn.* has been deprecated, and widely used cells/functions will be moved to tensorflow/addons repository. Please check it there and file Github issues if necessary. +154:17: ERROR: Using member tf.contrib.rnn.DropoutWrapper in deprecated module tf.contrib. tf.contrib.rnn.DropoutWrapper cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. +156:17: WARNING: Using member tf.contrib.rnn.DropoutWrapper in deprecated module tf.contrib.rnn. (Manual edit required) tf.contrib.rnn.* has been deprecated, and widely used cells/functions will be moved to tensorflow/addons repository. 
Please check it there and file Github issues if necessary. +156:17: ERROR: Using member tf.contrib.rnn.DropoutWrapper in deprecated module tf.contrib. tf.contrib.rnn.DropoutWrapper cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. +159:19: INFO: Renamed 'tf.contrib.rnn.MultiRNNCell' to 'tf.compat.v1.nn.rnn_cell.MultiRNNCell' +162:38: INFO: Renamed 'tf.nn.dynamic_rnn' to 'tf.compat.v1.nn.dynamic_rnn' +165:9: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +165:9: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +168:23: WARNING: tf.get_variable requires manual check. tf.get_variable returns ResourceVariables by default in 2.0, which have well-defined semantics and are stricter about shapes. You can disable this behavior by passing use_resource=False, or by calling tf.compat.v1.disable_resource_variables(). +168:23: INFO: Renamed 'tf.get_variable' to 'tf.compat.v1.get_variable' +169:23: WARNING: tf.get_variable requires manual check. tf.get_variable returns ResourceVariables by default in 2.0, which have well-defined semantics and are stricter about shapes. You can disable this behavior by passing use_resource=False, or by calling tf.compat.v1.disable_resource_variables(). +169:23: INFO: Renamed 'tf.get_variable' to 'tf.compat.v1.get_variable' +171:18: ERROR: Using member tf.contrib.legacy_seq2seq.sequence_loss_by_example in deprecated module tf.contrib. tf.contrib.legacy_seq2seq.sequence_loss_by_example cannot be converted automatically. tf.contrib will not be distributed with TensorFlow 2.0, please consider an alternative in non-contrib TensorFlow, a community-maintained repository such as tensorflow/addons, or fork the required code. +173:18: INFO: Renamed 'tf.div' to 'tf.compat.v1.div' +173:25: INFO: Added keywords to args of function 'tf.reduce_sum' +178:9: INFO: `name` passed to `name_scope`. Because you may be re-entering an existing scope, it is not safe to convert automatically, the v2 name_scope does not support re-entering scopes by name. + +178:9: INFO: Renamed 'tf.name_scope' to 'tf.compat.v1.name_scope' +180:14: INFO: Renamed 'tf.trainable_variables' to 'tf.compat.v1.trainable_variables' +181:49: INFO: Added keywords to args of function 'tf.gradients' +183:18: INFO: Renamed 'tf.train.GradientDescentOptimizer' to 'tf.compat.v1.train.GradientDescentOptimizer' +188:6: INFO: tf.summary.scalar requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +188:6: INFO: Renamed 'tf.summary.scalar' to 'tf.compat.v1.summary.scalar' +191:6: INFO: tf.summary.scalar requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). 
For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +191:6: INFO: Renamed 'tf.summary.scalar' to 'tf.compat.v1.summary.scalar' +193:19: INFO: Renamed 'tf.train.Saver' to 'tf.compat.v1.train.Saver' +193:34: INFO: Renamed 'tf.global_variables' to 'tf.compat.v1.global_variables' +194:24: INFO: Renamed 'tf.initialize_all_variables' to 'tf.compat.v1.initialize_all_variables' +195:21: INFO: tf.summary.merge_all requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +195:21: INFO: Renamed 'tf.summary.merge_all' to 'tf.compat.v1.summary.merge_all' +204:13: INFO: Renamed 'tf.trainable_variables' to 'tf.compat.v1.trainable_variables' +265:21: INFO: tf.summary.FileWriter requires manual check. The TF 1.x summary API cannot be automatically migrated to TF 2.0, so symbols have been converted to tf.compat.v1.summary.* and must be migrated manually. Typical usage will only require changes to the summary writing logic, not to individual calls like scalar(). For examples of the new summary API, see the Effective TF 2.0 migration document or check the TF 2.0 TensorBoard tutorials. +265:21: INFO: Renamed 'tf.summary.FileWriter' to 'tf.compat.v1.summary.FileWriter' +315:8: WARNING: *.save requires manual check. (This warning is only applicable if the code saves a tf.Keras model) Keras model.save now saves to the Tensorflow SavedModel format by default, instead of HDF5. To continue saving to HDF5, add the argument save_format='h5' to the save() function. +317:35: INFO: Renamed 'tf.Summary' to 'tf.compat.v1.Summary' +318:35: INFO: Renamed 'tf.Summary' to 'tf.compat.v1.Summary' +439:20: INFO: Renamed 'tf.train.checkpoint_exists' to 'tf.compat.v1.train.checkpoint_exists' +538:20: INFO: Renamed 'tf.train.checkpoint_exists' to 'tf.compat.v1.train.checkpoint_exists' +642:11: INFO: Renamed 'tf.train.checkpoint_exists' to 'tf.compat.v1.train.checkpoint_exists' +695:17: INFO: Renamed 'tf.train.checkpoint_exists' to 'tf.compat.v1.train.checkpoint_exists' +736:12: WARNING: *.save requires manual check. (This warning is only applicable if the code saves a tf.Keras model) Keras model.save now saves to the Tensorflow SavedModel format by default, instead of HDF5. To continue saving to HDF5, add the argument save_format='h5' to the save() function. +744:20: INFO: Renamed 'tf.train.checkpoint_exists' to 'tf.compat.v1.train.checkpoint_exists' +1010:22: INFO: Renamed 'tf.train.checkpoint_exists' to 'tf.compat.v1.train.checkpoint_exists' +1609:11: INFO: Renamed 'tf.train.checkpoint_exists' to 'tf.compat.v1.train.checkpoint_exists' +1661:17: INFO: Renamed 'tf.train.checkpoint_exists' to 'tf.compat.v1.train.checkpoint_exists' +1701:12: WARNING: *.save requires manual check. (This warning is only applicable if the code saves a tf.Keras model) Keras model.save now saves to the Tensorflow SavedModel format by default, instead of HDF5. To continue saving to HDF5, add the argument save_format='h5' to the save() function. 
+1712:20: INFO: Renamed 'tf.train.checkpoint_exists' to 'tf.compat.v1.train.checkpoint_exists' +2054:9: INFO: Renamed 'tf.Session' to 'tf.compat.v1.Session' +2069:14: INFO: Renamed 'tf.train.checkpoint_exists' to 'tf.compat.v1.train.checkpoint_exists' +2074:16: INFO: Renamed 'tf.global_variables_initializer' to 'tf.compat.v1.global_variables_initializer' +2124:13: INFO: Renamed 'tf.Session' to 'tf.compat.v1.Session' +2148:13: INFO: Renamed 'tf.Session' to 'tf.compat.v1.Session' +2173:13: INFO: Renamed 'tf.Session' to 'tf.compat.v1.Session' +2204:13: INFO: Renamed 'tf.Session' to 'tf.compat.v1.Session' +2243:13: INFO: Renamed 'tf.Session' to 'tf.compat.v1.Session' +2268:13: INFO: Renamed 'tf.Session' to 'tf.compat.v1.Session' +2331:4: INFO: Renamed 'tf.app.run' to 'tf.compat.v1.app.run' +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +Processing file 'OpenVocabCodeNLM\\create_subtoken_data.py' + outputting to 'OpenVocabCodeNLM_V2\\create_subtoken_data.py' +-------------------------------------------------------------------------------- + + +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +Processing file 'OpenVocabCodeNLM\\non-ascii_sequences_to_unk.py' + outputting to 'OpenVocabCodeNLM_V2\\non-ascii_sequences_to_unk.py' +-------------------------------------------------------------------------------- + +ERROR: Failed to parse. +Traceback (most recent call last): + File "C:\Users\Leonh\anaconda3\envs\OpenVocabCodeNLM_Bumped\lib\site-packages\tensorflow\tools\compatibility\ast_edits.py", line 940, in update_string_pasta + t = pasta.parse(text) + File "C:\Users\Leonh\anaconda3\envs\OpenVocabCodeNLM_Bumped\lib\site-packages\pasta\__init__.py", line 23, in parse + t = ast_utils.parse(src) + File "C:\Users\Leonh\anaconda3\envs\OpenVocabCodeNLM_Bumped\lib\site-packages\pasta\base\ast_utils.py", line 56, in parse + tree = ast.parse(sanitize_source(src)) + File "C:\Users\Leonh\anaconda3\envs\OpenVocabCodeNLM_Bumped\lib\ast.py", line 50, in parse + return compile(source, filename, mode, flags, + File "", line 24 + print 'Usage non-ascii_sequences_to_unk.py source_file destination_file' + ^ +SyntaxError: Missing parentheses in call to 'print'. Did you mean print('Usage non-ascii_sequences_to_unk.py source_file destination_file')? 
+ +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +Processing file 'OpenVocabCodeNLM\\reader.py' + outputting to 'OpenVocabCodeNLM_V2\\reader.py' +-------------------------------------------------------------------------------- + +80:9: INFO: Renamed 'tf.gfile.GFile' to 'tf.io.gfile.GFile' +92:9: INFO: Renamed 'tf.gfile.GFile' to 'tf.io.gfile.GFile' +109:9: INFO: Renamed 'tf.gfile.GFile' to 'tf.io.gfile.GFile' +125:9: INFO: Renamed 'tf.gfile.GFile' to 'tf.io.gfile.GFile' +-------------------------------------------------------------------------------- + +-------------------------------------------------------------------------------- +Processing file 'OpenVocabCodeNLM\\util\\identifier_mapping.py' + outputting to 'OpenVocabCodeNLM_V2\\util\\identifier_mapping.py' +-------------------------------------------------------------------------------- + + +-------------------------------------------------------------------------------- + diff --git a/requirements.txt b/requirements.txt index 685e912..0abe29c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,67 @@ -numpy==1.18.1 -pygtrie==2.3.2 -tensorflow-gpu==1.12.3 +absl-py==1.0.0 +aiohttp==3.8.1 +aiosignal==1.2.0 +astor==0.8.1 +astunparse==1.6.3 +async-timeout==4.0.2 +attrs==21.4.0 +blinker==1.4 +brotlipy==0.7.0 +cached-property==1.5.2 +cachetools==4.2.4 +certifi==2021.10.8 +cffi==1.15.0 +charset-normalizer==2.0.10 +click==8.0.3 +colorama==0.4.4 +cryptography==36.0.1 +dataclasses==0.8 +flatbuffers==1.12 +frozenlist==1.2.0 +gast==0.4.0 +google-auth==1.35.0 +google-auth-oauthlib==0.4.6 +google-pasta==0.2.0 +grpcio==1.43.0 +h5py==3.6.0 +idna==3.1 +importlib-metadata==4.10.0 +keras==2.6.0 +Keras-Preprocessing==1.1.2 +Markdown==3.3.6 +multidict==5.2.0 +numpy==1.22.0 +oauthlib==3.1.1 +opt-einsum==3.3.0 +pip==21.3.1 +protobuf==3.17.2 +pyasn1==0.4.8 +pyasn1-modules==0.2.7 +pycparser==2.21 +pygtrie==2.4.2 +PyJWT==2.3.0 +pyOpenSSL==21.0.0 +PySocks==1.7.1 +pyu2f==0.1.5 +requests==2.27.1 +requests-oauthlib==1.3.0 +rsa==4.8 +scipy==1.7.3 +setuptools==60.5.0 +six==1.16.0 +tensorboard==2.6.0 +tensorboard-data-server==0.6.0 +tensorboard-plugin-wit==1.8.1 +tensorflow==2.6.0 +tensorflow-addons==0.15.0 +tensorflow-estimator==2.6.0 +termcolor==1.1.0 +typeguard==2.13.3 +typing_extensions==4.0.1 +urllib3==1.26.8 +Werkzeug==2.0.1 +wheel==0.35.1 +win-inet-pton==1.1.0 +wrapt==1.13.3 +yarl==1.7.2 +zipp==3.7.0