-
Notifications
You must be signed in to change notification settings - Fork 0
/
essay scoring_update.ipnyb
1 lines (1 loc) · 31.4 KB
/
essay scoring_update.ipnyb
1
{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"8cgF4NVgFiIx"},"outputs":[],"source":["import pandas as pd"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":10,"status":"ok","timestamp":1657178452527,"user":{"displayName":"백서윤","userId":"07287278861098105491"},"user_tz":-540},"id":"oEKffWV4Keng","outputId":"1f4bd640-cbb0-4339-e927-8cc032c72630"},"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package stopwords to /root/nltk_data...\n","[nltk_data] Package stopwords is already up-to-date!\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":68}],"source":["import nltk\n","from nltk.corpus import stopwords\n","nltk.download('stopwords')"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5181,"status":"ok","timestamp":1657178461455,"user":{"displayName":"백서윤","userId":"07287278861098105491"},"user_tz":-540},"id":"ueKhjuhKQCFh","outputId":"2e995c3e-a2ba-4ecc-cd1f-8245b6b5a3e6"},"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1274,"status":"ok","timestamp":1657178463919,"user":{"displayName":"백서윤","userId":"07287278861098105491"},"user_tz":-540},"id":"TgOdFNfoLEXg","outputId":"f99d5880-7541-4c68-9734-f21e98a385fa"},"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py:2882: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version.\n","\n","\n"," exec(code_obj, self.user_global_ns, self.user_ns)\n"]}],"source":["# Data loading\n","test_data = pd.read_csv(\"/content/drive/MyDrive/Automated Debate Scoring/test_set.tsv\",sep='\\t', encoding='ISO-8859-1', quoting=3, error_bad_lines=False)\n","valid_data = pd.read_csv(\"/content/drive/MyDrive/Automated Debate Scoring/valid_set.tsv\",sep='\\t', encoding='ISO-8859-1')\n","train_data = pd.read_csv(\"/content/drive/MyDrive/Automated Debate Scoring/training_set_rel3.tsv\", sep='\\t', encoding = 'ISO-8859-1', quoting=3, error_bad_lines=False)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"GVZVjmLRaH0o"},"outputs":[],"source":["test_data.dropna(axis=1,inplace=True)\n","valid_data.dropna(axis=1,inplace=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":680,"status":"ok","timestamp":1657178467548,"user":{"displayName":"백서윤","userId":"07287278861098105491"},"user_tz":-540},"id":"5T74HqeQLHoF","outputId":"ed847061-2a38-46a9-e189-f5c8bf2c645c"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["0 8\n","1 9\n","2 7\n","3 10\n","13 6\n","15 12\n","18 4\n","21 3\n","23 11\n","40 2\n","49 5\n","1784 1\n","3590 0\n","10686 15\n","10687 13\n","10689 17\n","10691 23\n","10692 16\n","10693 18\n","10697 19\n","10699 14\n","10704 21\n","10707 24\n","10722 20\n","10752 22\n","12255 34\n","12256 46\n","12257 40\n","12258 30\n","12259 26\n","12263 41\n","12265 31\n","12266 44\n","12268 36\n","12269 43\n","12271 45\n","12274 35\n","12276 42\n","12278 33\n","12279 38\n","12287 47\n","12290 32\n","12297 50\n","12298 39\n","12307 37\n","12337 55\n","12342 60\n","12344 28\n","12345 49\n","12347 29\n","12352 27\n","12507 25\n","12526 48\n","Name: domain1_score, dtype: int64"]},"metadata":{},"execution_count":72}],"source":["# Setting data\n","X_data = train_data['essay']\n","Y_data = train_data['domain1_score']\n","\n","X_data.drop_duplicates()\n","Y_data.drop_duplicates()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"iavYPoivbWSq"},"outputs":[],"source":["def essay_to_wordlist(essay_v, remove_stopwords):\n"," #Remove the tagged labels and word tokenize the sentence.\n"," essay_v = re.sub(\"[^a-zA-Z]\", \" \", essay_v)\n"," words = essay_v.lower().split()\n"," if remove_stopwords:\n"," stops = set(stopwords.words(\"english\"))\n"," words = [w for w in words if not w in stops]\n"," return (words)\n","\n","def essay_to_sentences(essay_v, remove_stopwords):\n"," \"\"\"Sentence tokenize the essay and call essay_to_wordlist() for word tokenization.\"\"\"\n"," tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')\n"," raw_sentences = tokenizer.tokenize(essay_v.strip())\n"," sentences = []\n"," for raw_sentence in raw_sentences:\n"," if len(raw_sentence) > 0:\n"," sentences.append(essay_to_wordlist(raw_sentence, remove_stopwords))\n"," return sentences\n","\n","def makeFeatureVec(words, model, num_features):\n"," \"\"\"Make ar from the words list of an Essay.\"\"\"\n"," featureVec = np.zeros((num_features,),dtype=\"float32\")\n"," num_words = 0.\n"," index2word_set = set(model.wv.index2word)\n"," for word in words:\n"," if word in index2word_set:\n"," num_words += 1\n"," featureVec = np.add(featureVec,model[word]) \n"," featureVec = np.divide(featureVec,num_words)\n"," return featureVec\n","\n","def getAvgFeatureVecs(essays, model, num_features):\n"," \"\"\"Main function to generate the word vectors for word2vec model.\"\"\"\n"," counter = 0\n"," essayFeatureVecs = np.zeros((len(essays),num_features),dtype=\"float32\")\n"," for essay in essays:\n"," essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)\n"," counter = counter + 1\n"," return essayFeatureVecs"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"pwm1B5NMcw0h"},"outputs":[],"source":["from sklearn.model_selection import train_test_split\n","X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size = 0.2, random_state = 38)"]},{"cell_type":"code","source":["# X_test.sort_index() 후 첫 인덱스를 토론 스크립트로 대체\n","\n","X_test.sort_index()\n","X_test[3] = \"First of all, I would like to start off with the revolution from the government and then move on to our two main arguments. To start off with the revolution, we agree with the goal of preserving well being of the child. However, this well being does not not only need the physical wellbeing, but it also includes mental wellbeing. And in their side we can not protect the child's mental wellbeing as much as our as I can. And also regarding what the religious cases that they're talking about. Some religious might think that their religion belief and stuff like that is like the main value and things. But in our modern society, these kind of cases where they do degrees of those actions is actually considered as child abuse. And in case of this is affecting their lives directly along with their life. And if they are actually killing people because of their religious belief of their parents, then we believe that this could be considered as child abuse cases and be treated legally so they can get the help that they need. They also talked about about the justification of supporting the legal guardianship existing appearance in the Santa's Pro. There are lots of people who discuss about who to have these legal guardianship and the reason that they decided the people who have their legal guardianship is actually their guardians and parents. We believe there is some sort of reason behind it and we will explain that further on in our arguments. To start off with, before we start off with the argument, we believe that there are two characteristics of a child. First is irrationality and the second point, which is most important is that they have their life that's about 80 or 90 years in their life. So we have to consider the amount of life that they're going to live further and think of like the best way for them to live a very wonderful life in the future. So, first argument is not the doubts of qualification. What aspect should be considered an irrational favorable decision for the child? First, we think we have to consider the medical information. The information itself is not mutually exclusive to health care experts as our due to the leader of opposition has talked about. They might have more changes that they experience, they might have more examples that they face throughout their medical life. But however, that does not guarantee 100% chance of them signing the right way because in these kind of life threatening situations, these half half where these chances are normally really dangerous. So even if they decide the best way that they can think half, it does not guarantee a perfectly great life for them and it can actually lead to death as well. So we believe that in whatever medical information or medical knowledge that they have, they do not guarantee 100% life chance for their child. And because this life is regarding their parents rather than the medical professionals themselves. We believe that we have to give a chance for the parents to decide what kind of medical procedure and what kind of things that they have to care about to talk about the second case, which is about understanding of the child's will. We believe this is the most important aspect in deciding what kind of mental treatment to do in somebody clients. So why is understanding the child's importance? Because in life certainly conditions it's only something like child cancer or zukomia or something like that, a very serious and serious disease. So in these kinds of cases, how the child feels due to each procedure is really important because it directly connects to the health and quality of life for long periods. So this isn't just over by one single surgery or some sort of like true surgery. It can take more than two years or three years and stuff like that. And normal average time period of these kind of diseases actually take two to three years. So when you're deciding something about two to three years of this entire child's life, then we believe that this child does not really want to do some sort of this procedure, or they don't feel comfortable doing this kind of medical procedures, then we have to respect their decision and their rights as well. And in the other cases about medical emergency where child's tense, they don't have any kind of consciousness, they have some sort of serious problem that they can't really decide anything. We believe throughout the life that their parents have lived with this child and throughout the times that they work with them. We believe that they can actually know directly on what kind of thoughts and values that this child has and this can be decided by their parents. Then why sometimes experts know these kind of child will. We believe this is because of the aspect that child can communicate more comfortably with their parents compared to the medical experts. The time the medical experts see the child is actually about two to three minutes per day. If they're in some sort of situation or if they're in the hospital themselves, they don't actually see the doctors more than 1 hour or longer like that. So we believe it's not enough time for the health expert to know with this child really? No thank you, what's this procedure? However, in the case of guardians and parents, they are actually with them 24/7 mostly in these severe cases and they would have lived with this child for a certain amount of time. So we believe that they have a lot of knowledge regarding what the child really wants and those kind of aspects can lead to better decisions of their parents themselves. So we believe that the amount of information that the parents has compared to the medical information about their child is actually better in the case of their parents. Second, to talk about the benefits for the futures of child's life. We believe that because child actually has 80 90 years of child life left, we have to consider this really directly. In the case of parents, if they talk to above, parents inherently care the very best for their child, and if not, if they don't care about their child, that is trails and incelporately. So this is something that you're going to take it legally. It's not like something that we have to discuss. And in the case of health care at Frisky, they might only have great intentions and they might have better quality of life and stuff like that. But however, that does not mean that they can always make the best decision, as I said before. So we are the toughest in our later argument and also when we talk about the best decision of making the future life of patients. Life decisions actually normally have lots of side effects that can cause from the disease themselves. So in the case of leukemia, when they do this like this medical procedure from their brain taking out the liquid that's in the brain, if they take out the liquid, then there is a percentage of those child living a much longer life. But in this case, this child might get paralyzed. And those sort of side effects do exist in these kind of situations. So we believe we have to consider the parents opinion and what they want to choose. Whether they want to choose like paralyzed child or some sort of other side effects should be totally upon the parents, not the medical expert themselves. Because medical experts will not live with this child forever. But in the case apparently you have to live with them for another ten or 20 years. So we believe it's important to consider the family conditions and their situation because not every single family might be able to live with those child conditions because of their special reasons. And to talk about the second argument, we believe it's going to be too much better for medical experts in the case of the situation. In the case of doctors, yes, they might have great intentions as they talked about. However, doctors don't only meet like one or two patients a day, they have to meet if they're really famous, they have to meet about 100 patients a day. And that does not mean that they have enough time and think about each patient's best medical cases and each patient's great way to solve their health problems and stuff like that. So we will make these kind of value decisions when it is not about just living and dying and when it's something about deciding what kind of side effects they want and then something that we have to consider about the family issues and this child's will as well. We believe doctors can't make the best decisions for themselves and we believe that it's going to be a lot of work for the doctors because patients themselves can actually blame the doctors as well when something goes wrong and they can suffer from those kinds of extreme mental stress and stuff like that. So we believe that parents should have the legal bargainship towards these medical cases.\"\n","X_test[6] = \"For many years, I have been interested in studying international relations. My interest in pursuing this field stems from several factors which have affected me. First, I have been exposed to international affairs throughout my life. With my father and two of my brothers in the Saudi Foreign Service, I have grown up under the shadow of inter-national affairs. Second, I am fascinated by history, economics, and diplomacy. I believe, through the study of international relations, I can effectively satisfy my curiosity in these fields. A third factor which has affected my interest in international relations is patriotism. Through the Foreign Service, I would not only have the opportunity to serve my country, but also have the chance to help bridge gaps between my country and others. Finally, as a Saudi living abroad, I have been bridging cultures throughout my life. This experience has taught me to look for differences to compromise and similarities to synthesize in order to balance different cultures. In short, I believe that my experiences in life, combined with a rigorous academic education, will enable me to pursue a successful career in the Saudi Foreign Service.\""],"metadata":{"id":"efB0xKUClUCi"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":50383,"status":"ok","timestamp":1657178551618,"user":{"displayName":"백서윤","userId":"07287278861098105491"},"user_tz":-540},"id":"D2DDDhWxdPOC","outputId":"57299ae3-01ed-4d07-afd1-2857a4fffaed"},"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Package punkt is already up-to-date!\n"]},{"output_type":"stream","name":"stdout","text":["Training Word2Vec Model...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:28: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n"]}],"source":["from gensim.models import Word2Vec\n","import numpy as np\n","import nltk\n","import re\n","nltk.download('punkt')\n","\n","sentences = []\n","for essay in X_train:\n"," # Obtaining all sentences from the training essays.\n"," sentences += essay_to_sentences(essay, remove_stopwords = True)\n","\n","# Initializing variables for word2vec model.\n","num_features = 300\n","min_word_count = 40\n","num_workers = 8\n","context = 10\n","downsampling = 1e-3\n","\n","print(\"Training Word2Vec Model...\")\n","model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)\n","\n","model.init_sims(replace=True)\n","model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)\n","\n","clean_train_essays = [] \n","# Generate training and testing data word vectors.\n","for essay_v_train in X_train:\n"," clean_train_essays.append(essay_to_wordlist(essay_v_train, remove_stopwords=True))\n","trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)\n"," \n","clean_test_essays = []\n","for essay_v_test in X_test:\n"," clean_test_essays.append(essay_to_wordlist(essay_v_test, remove_stopwords=True ))\n"," \n","testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )\n","trainDataVecs = np.array(trainDataVecs)\n","testDataVecs = np.array(testDataVecs)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2b3TcANnu9aq"},"outputs":[],"source":["from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten\n","from keras.models import Sequential, load_model, model_from_config\n","import keras.backend as K\n","\n","def get_model():\n"," \"\"\"Define the model.\"\"\"\n"," model = Sequential()\n"," model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))\n"," model.add(LSTM(64, recurrent_dropout=0.4))\n"," model.add(Dropout(0.5))\n"," model.add(Dense(1, activation='relu'))\n","\n"," model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])\n"," model.summary()\n","\n"," return model"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oM-kumOCvwro","executionInfo":{"status":"ok","timestamp":1657178875306,"user_tz":-540,"elapsed":318763,"user":{"displayName":"백서윤","userId":"07287278861098105491"}},"outputId":"d9426c2f-2e38-4113-f9f8-581007b2bbf0"},"outputs":[{"output_type":"stream","name":"stdout","text":["\n","--------Fold 1--------\n","\n","Training Word2Vec Model...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:28: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n"]},{"output_type":"stream","name":"stdout","text":["Model: \"sequential_7\"\n","_________________________________________________________________\n"," Layer (type) Output Shape Param # \n","=================================================================\n"," lstm_14 (LSTM) (None, 1, 300) 721200 \n"," \n"," lstm_15 (LSTM) (None, 64) 93440 \n"," \n"," dropout_7 (Dropout) (None, 64) 0 \n"," \n"," dense_7 (Dense) (None, 1) 65 \n"," \n","=================================================================\n","Total params: 814,705\n","Trainable params: 814,705\n","Non-trainable params: 0\n","_________________________________________________________________\n","Epoch 1/2\n","163/163 [==============================] - 10s 28ms/step - loss: 60.9651 - mae: 4.1140\n","Epoch 2/2\n","163/163 [==============================] - 5s 28ms/step - loss: 34.7050 - mae: 3.3115\n","Kappa Score: 0.7629660162831869\n","\n","--------Fold 2--------\n","\n","Training Word2Vec Model...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:28: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n"]},{"output_type":"stream","name":"stdout","text":["Model: \"sequential_8\"\n","_________________________________________________________________\n"," Layer (type) Output Shape Param # \n","=================================================================\n"," lstm_16 (LSTM) (None, 1, 300) 721200 \n"," \n"," lstm_17 (LSTM) (None, 64) 93440 \n"," \n"," dropout_8 (Dropout) (None, 64) 0 \n"," \n"," dense_8 (Dense) (None, 1) 65 \n"," \n","=================================================================\n","Total params: 814,705\n","Trainable params: 814,705\n","Non-trainable params: 0\n","_________________________________________________________________\n","Epoch 1/2\n","163/163 [==============================] - 10s 29ms/step - loss: 60.5122 - mae: 4.0996\n","Epoch 2/2\n","163/163 [==============================] - 5s 31ms/step - loss: 34.2653 - mae: 3.3150\n","Kappa Score: 0.7582421854152035\n","\n","--------Fold 3--------\n","\n","Training Word2Vec Model...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:28: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n"]},{"output_type":"stream","name":"stdout","text":["Model: \"sequential_9\"\n","_________________________________________________________________\n"," Layer (type) Output Shape Param # \n","=================================================================\n"," lstm_18 (LSTM) (None, 1, 300) 721200 \n"," \n"," lstm_19 (LSTM) (None, 64) 93440 \n"," \n"," dropout_9 (Dropout) (None, 64) 0 \n"," \n"," dense_9 (Dense) (None, 1) 65 \n"," \n","=================================================================\n","Total params: 814,705\n","Trainable params: 814,705\n","Non-trainable params: 0\n","_________________________________________________________________\n","Epoch 1/2\n","163/163 [==============================] - 10s 29ms/step - loss: 59.9468 - mae: 4.0946\n","Epoch 2/2\n","163/163 [==============================] - 5s 28ms/step - loss: 33.8278 - mae: 3.2873\n","Kappa Score: 0.7695912382931149\n","\n","--------Fold 4--------\n","\n","Training Word2Vec Model...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:28: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n"]},{"output_type":"stream","name":"stdout","text":["Model: \"sequential_10\"\n","_________________________________________________________________\n"," Layer (type) Output Shape Param # \n","=================================================================\n"," lstm_20 (LSTM) (None, 1, 300) 721200 \n"," \n"," lstm_21 (LSTM) (None, 64) 93440 \n"," \n"," dropout_10 (Dropout) (None, 64) 0 \n"," \n"," dense_10 (Dense) (None, 1) 65 \n"," \n","=================================================================\n","Total params: 814,705\n","Trainable params: 814,705\n","Non-trainable params: 0\n","_________________________________________________________________\n","Epoch 1/2\n","163/163 [==============================] - 10s 28ms/step - loss: 62.1871 - mae: 4.1692\n","Epoch 2/2\n","163/163 [==============================] - 5s 28ms/step - loss: 34.6524 - mae: 3.2963\n","Kappa Score: 0.7635937730296873\n","\n","--------Fold 5--------\n","\n","Training Word2Vec Model...\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:28: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n"]},{"output_type":"stream","name":"stdout","text":["Model: \"sequential_11\"\n","_________________________________________________________________\n"," Layer (type) Output Shape Param # \n","=================================================================\n"," lstm_22 (LSTM) (None, 1, 300) 721200 \n"," \n"," lstm_23 (LSTM) (None, 64) 93440 \n"," \n"," dropout_11 (Dropout) (None, 64) 0 \n"," \n"," dense_11 (Dense) (None, 1) 65 \n"," \n","=================================================================\n","Total params: 814,705\n","Trainable params: 814,705\n","Non-trainable params: 0\n","_________________________________________________________________\n","Epoch 1/2\n","163/163 [==============================] - 10s 29ms/step - loss: 63.6032 - mae: 4.2917\n","Epoch 2/2\n","163/163 [==============================] - 5s 28ms/step - loss: 36.1515 - mae: 3.4327\n","Kappa Score: 0.7618365224901404\n"]}],"source":["from sklearn.model_selection import KFold\n","from sklearn.metrics import cohen_kappa_score\n","\n","cv = KFold(n_splits = 5, shuffle = True)\n","results = []\n","y_pred_list = []\n","\n","count = 1\n","for traincv, testcv in cv.split(X_data):\n"," print(\"\\n--------Fold {}--------\\n\".format(count))\n"," \n"," train_essays = X_train\n"," test_essays = X_test\n"," \n"," sentences = []\n"," \n"," for essay in train_essays:\n"," # Obtaining all sentences from the training essays.\n"," sentences += essay_to_sentences(essay, remove_stopwords = True)\n"," \n"," # Initializing variables for word2vec model.\n"," num_features = 300\n"," min_word_count = 40\n"," num_workers = 8\n"," context = 10\n"," downsampling = 1e-3\n","\n"," print(\"Training Word2Vec Model...\")\n"," model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)\n","\n"," model.init_sims(replace=True)\n"," model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)\n","\n"," clean_train_essays = []\n"," # Generate training and testing data word vectors.\n"," for essay_v in train_essays:\n"," clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))\n"," trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)\n"," \n"," clean_test_essays = []\n"," for essay_v in test_essays:\n"," clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))\n"," testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )\n","\n"," # Reshaping train and test vectors to 3 dimensions. (1 represnts one timestep) \n"," trainDataVecs = np.array(trainDataVecs)\n"," testDataVecs = np.array(testDataVecs)\n"," trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))\n"," testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))\n"," \n"," lstm_model = get_model()\n"," lstm_model.fit(trainDataVecs, Y_train, batch_size=64, epochs=2)\n"," #lstm_model.load_weights('./model_weights/final_lstm.h5')\n"," y_pred = lstm_model.predict(testDataVecs)\n"," \n"," # Save any one of the 5 models.\n"," if count == 5:\n"," lstm_model.save_weights('final_lstm.h5')\n"," \n"," # Round y_pred to the nearest integer.\n"," y_pred = np.around(y_pred)\n"," \n"," # Evaluate the model on the evaluation metric. \"Quadratic mean averaged Kappa\"\n"," result = cohen_kappa_score(Y_test.values,y_pred,weights='quadratic')\n"," print(\"Kappa Score: {}\".format(result))\n"," results.append(result)\n","\n"," count += 1"]},{"cell_type":"code","source":["def winner(A,B):\n"," if A>B:\n"," print(\"Team A won\")\n"," elif A==B:\n"," print(\"Draw\")\n"," else:\n"," print(\"Team B won\")"],"metadata":{"id":"ZRnnOF5ngads"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["prediction = lstm_model.predict(testDataVecs)\n","teamA = prediction[0]\n","teamB = prediction[1]\n","winner(teamA, teamB)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0R9jWxOZgmgb","executionInfo":{"status":"ok","timestamp":1657178891110,"user_tz":-540,"elapsed":1473,"user":{"displayName":"백서윤","userId":"07287278861098105491"}},"outputId":"4423f0a0-3b8b-4023-f200-534466c9893c"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Team B won\n"]}]}],"metadata":{"colab":{"collapsed_sections":[],"name":"essay scoring_update.ipnyb","provenance":[{"file_id":"1C7awYuRoXH4XekQGbCgz-E3Pka9Td353","timestamp":1657163837227},{"file_id":"1s8ucXJ36wz8FaC_BZsHDEoOsQ8_PT5b4","timestamp":1656853407616}]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}