Issue#85: Frequency Analysis Word Cloud #94

Open
wants to merge 94 commits into base: master

Commits (94)
1e5c149
created the spring log for the documentation part of our tasks
solisa986 Mar 30, 2021
3f5ac7b
finished the spring log for issue#51
solisa986 Mar 30, 2021
2a5a4e4
Writing word frequencies to csv
Mar 31, 2021
3233a42
Merge branch 'issue#51' of github.com:Allegheny-Ethical-CS/GatorMiner…
Mar 31, 2021
225ce2f
Putting different run's results into separate files
Mar 31, 2021
446215a
Update textmining.py
hadenwIV Mar 31, 2021
b638b98
Categorization of words
Mar 31, 2021
902e704
Additional elaboration on functions of tasks completed
Mar 31, 2021
a065da8
Fixed name spelling
Mar 31, 2021
c208849
Added docstrings
Mar 31, 2021
9f9733b
moving all of our code files to a folder called categorize_words
donizk Apr 1, 2021
25abe2e
created interface file, began implementation for interface
donizk Apr 1, 2021
b816c40
added notes (as comments) to myself onto the __main__.py file to keep…
donizk Apr 1, 2021
60c36e2
added some test cases
solisa986 Apr 5, 2021
9455433
classifying categories of files inputted
Apr 5, 2021
1b63ffd
Merge branch 'issue#51' of github.com:Allegheny-Ethical-CS/GatorMiner…
Apr 5, 2021
f994779
Sorting assignment categories
Apr 5, 2021
48f9fae
finished documenting sprint 2 log and moved the categories_words.py file
solisa986 Apr 5, 2021
013b9db
formatting
solisa986 Apr 5, 2021
8b5347d
Merge branch 'issue#51' of https://github.com/Allegheny-Ethical-CS/Ga…
hadenwIV Apr 6, 2021
b95ebee
Revert "Merge branch 'issue#51' of https://github.com/Allegheny-Ethic…
enpuyou Apr 6, 2021
2e79165
Word categorization program
Apr 7, 2021
56c8689
Word categorization using training data and Scikit
Apr 7, 2021
ccd2cba
Start of the interface pipeline
Apr 7, 2021
85c9ce6
Beginning of interface page to for category frequency analysis
Apr 7, 2021
5dec4f7
Removed category classification model training data
Apr 7, 2021
5a9168e
Merge branch 'master' into issue#51
enpuyou Apr 10, 2021
eb71d6e
Development on categorization
Apr 14, 2021
4594407
Merge branch 'issue#51' of github.com:Allegheny-Ethical-CS/GatorMiner…
Apr 14, 2021
99e2dd1
Removed sample_md_reflections training data
Apr 14, 2021
c4d689b
Readded existing sample_md_reflections
Apr 15, 2021
598e3c1
Restored original sample_md_reflections
Apr 15, 2021
3f47837
fixing
favourojo Apr 15, 2021
cee57d9
Merge branch 'issue#51' of github.com:Allegheny-Ethical-CS/GatorMiner…
favourojo Apr 15, 2021
16bdf5d
fixed
favourojo Apr 15, 2021
f96f006
starting on wordcloud
favourojo Apr 15, 2021
3fabfe1
word cloud
favourojo Apr 21, 2021
83f2f93
Merge branch 'issue#85' of github.com:Allegheny-Ethical-CS/GatorMiner…
favourojo Apr 21, 2021
c0ed0bd
Installed wordcloud and got repository and pipfile up to date
Apr 22, 2021
d045d03
Addition of popup wordcloud of most frequent words
Apr 23, 2021
c915399
Working in GUI display of wordcloud
Apr 23, 2021
8552ef3
working on word cloud
favourojo Apr 27, 2021
a8ca53b
Fix pipfile.lock
Apr 27, 2021
97edd86
Restore markdown feature in analyzer
Apr 27, 2021
80ef017
Update pipfile to the master branch
Apr 27, 2021
c166ece
Update Pipfile.lock
Apr 27, 2021
ec09458
Update importlib.metadata
Apr 27, 2021
3d0a99c
Reupdate Pipfile.lock to master
Apr 27, 2021
1b0ec5a
Remove repeat line
Apr 27, 2021
fcb1469
Removed blank line from Pipfile.lock
Apr 27, 2021
dae590b
Update sample reflections to main
Apr 27, 2021
17ec1dc
Remove sprint log
Apr 27, 2021
74cac02
Remove word_cloud_test file
Apr 27, 2021
48242c0
Word cloud for student frequency
Apr 27, 2021
5dee753
Moved question_df in overall_freq closer to relevant code
Apr 27, 2021
bb87175
Change name of question_df to avoid confusion with dataframe in quest…
Apr 27, 2021
bc3b80d
Remove incomplete and irrelevant category_freq code
Apr 28, 2021
4973bfd
Remove writing of questions_df to streamlit
Apr 28, 2021
bbf0b4a
Delete unused word_cloud_generator file
Apr 28, 2021
ee4c16c
Restore textmining to original
Apr 28, 2021
5f55990
Delete unused frequencies.py file
Apr 28, 2021
14d9ee3
Update top of file to match master
Apr 28, 2021
6c583b2
Fix flake8 errors
Apr 28, 2021
7bd0f8f
Adding test case for concatenate
hadenwIV Apr 28, 2021
b1790c7
Adding second test case to test analyzer
hadenwIV Apr 28, 2021
7e2280c
Merge branch 'master' into issue#85
favourojo Apr 28, 2021
4fd878b
trying to fix the linting error
solisa986 Apr 28, 2021
efbe319
reverting back to the original code because the error still persists
solisa986 Apr 28, 2021
d00843d
Fixed concatenate test
Apr 28, 2021
2e1906e
Removed second named_entity_recognization test
Apr 28, 2021
de3fa02
Fix linting with lines
Apr 28, 2021
34899ca
Fix linting errors
Apr 28, 2021
7f8ea21
Merge branch 'master' into issue#85
corlettim Apr 29, 2021
cd31b77
Added wordcloud to pipfile
May 3, 2021
96d13ee
Merge branch 'issue#85' of github.com:Allegheny-Ethical-CS/GatorMiner…
May 3, 2021
37c95b4
Reset pipfile and added wordcloud
May 3, 2021
d9f7330
Reserve merge issues with master
May 3, 2021
7042f09
Update Pipfile.lock and pipfile to master
May 3, 2021
94465e9
Install wordcloud on pipenv
May 3, 2021
fd835e3
Removed space before [[source]]
May 3, 2021
07028fe
Remove git standup
May 3, 2021
18a5106
Change to importlib_metadata
May 3, 2021
d49be76
Revert to importlib-metadata
May 3, 2021
60ed6c8
Reverted importlib-metadata version
May 3, 2021
6751179
Reverted importlib-metadata hash
May 3, 2021
4248d61
Change skip pipfile to skip lock
hewittk May 3, 2021
a3f3de3
Update dependencies comment
hewittk May 3, 2021
92fe845
Remove md_dict
May 3, 2021
48230b4
Update retreive_data to main
May 3, 2021
706e52b
Condense word frequency cloud code into one method
May 3, 2021
e0398d3
Remove extraneous print statement
May 3, 2021
839e77f
Save path with item to frequency_archives
May 3, 2021
1aade5d
Add punctuation to added docstrings
hewittk Jun 9, 2021
68cf4e5
Add docstring to frequency_word_cloud
hewittk Jun 9, 2021
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
@@ -21,8 +21,8 @@ jobs:
uses: dschep/install-pipenv-action@v1
- name: Install dependencies
run: |
# install dependencies according to the lock file
pipenv install --dev --ignore-pipfile --python ${{ steps.setup-python.outputs.python-version }}
# install dependencies according to the pip file
pipenv install --dev --skip-lock --python ${{ steps.setup-python.outputs.python-version }}
pipenv run python -m spacy download en_core_web_sm
- name: Run test with pytest
run: |
1 change: 1 addition & 0 deletions Pipfile
@@ -25,6 +25,7 @@ scipy = "*"
pylint = "*"
importlib-metadata = "*"
atomicwrites = "*"
wordcloud = "*"

[pipenv]
allow_prereleases = true
449 changes: 195 additions & 254 deletions Pipfile.lock

Large diffs are not rendered by default.

18 changes: 17 additions & 1 deletion src/analyzer.py
@@ -1,14 +1,17 @@
"""Text Proprocessing"""
from collections import Counter

from . import markdown as md

from textblob import TextBlob
import pandas as pd

import re
import string
from typing import List, Tuple
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from . import markdown as md

PARSER = spacy.load("en_core_web_sm")

@@ -142,6 +145,19 @@ def noun_phrase(input_text):
return n_phrase_lst


def concatenate(responses_df):
    """Return a concatenated, lower-cased string of all words in the responses."""
    words_str = ''
    # walk every cell of the responses dataframe and lower-case its tokens
    for _, row in responses_df.iterrows():
        for col in range(len(responses_df.columns)):
            val = row[col]
            tokens = val.split()
            for i in range(len(tokens)):
                tokens[i] = tokens[i].lower()
            words_str += " ".join(tokens) + " "
    return words_str


def top_polarized_word(tokens_column):
"""Create columns for positive and negative words"""
# Start off with empty lists
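For reviewers, the expected behaviour of the new concatenate helper can be seen from a short, hypothetical example; the question names and responses below are illustrative only, and the sketch assumes pandas is installed and the module is imported as src.analyzer, as in the tests.

import pandas as pd

import src.analyzer as az

# Two hypothetical reflection questions with one response each.
responses_df = pd.DataFrame({
    "Technical skill": ["Using Pipenv and Pytest"],
    "Professional skill": ["Communicating with a Team"],
})

# concatenate() lower-cases every token and joins all cells into one
# space-separated string, ready to be passed to WordCloud.generate().
words = az.concatenate(responses_df)
print(words)  # "using pipenv and pytest communicating with a team "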
40 changes: 39 additions & 1 deletion streamlit_web.py
@@ -21,21 +21,27 @@
import src.topic_modeling as tm
import src.visualization as vis

from wordcloud import WordCloud, STOPWORDS

# resources/sample_reflections/lab1, resources/sample_reflections/lab2

# initialize main_df and preprocessed_Df
SPACY_MODEL_NAMES = ["en_core_web_sm", "en_core_web_md"]
preprocessed_df = pd.DataFrame()
main_df = pd.DataFrame()
sample = []
assignments = None
assign_text = None
stu_id = None
success_msg = None
debug_mode = False

json_lst = []

main_md_dict = None



def main():
"""main streamlit function"""
# Title
@@ -165,6 +171,7 @@ def retreive_data(data_retreive):
return True



@st.cache(allow_output_mutation=True)
def load_model(name):
"""load spacy model"""
@@ -264,11 +271,12 @@ def frequency():


def overall_freq(freq_range):
"""page fore overall word frequency"""
"""page for overall word frequency."""
plots_range = st.sidebar.slider(
"Select the number of plots per row", 1, 5, value=3
)
freq_df = pd.DataFrame(columns=["assignments", "word", "freq"])

# calculate word frequency of each assignments
for item in assignments:
# combined text of the whole assignment
@@ -288,6 +296,13 @@
)
)

responses_end = len(main_df.columns) - 3
responses_df = main_df[main_df.columns[1:responses_end]]
responses_df.replace("", "NA")

frequency_word_cloud(responses_df)

freq_df.to_csv('frequency_archives' + os.path.sep + str(item) + '.csv')

def student_freq(freq_range):
"""page for individual student's word frequency"""
@@ -331,6 +346,12 @@
)
)

responses_end = len(stu_assignment.columns) - 3
responses_df = stu_assignment[stu_assignment.columns[1:responses_end]]
responses_df.replace("", "NA")

frequency_word_cloud(responses_df)


def question_freq(freq_range):
"""page for individual question's word frequency"""
@@ -377,6 +398,23 @@
plots_per_row=plots_range,
)
)
frequency_word_cloud(question_df)


def frequency_word_cloud(responses_df):
"""Build wordcloud out of page's responses."""
# concatenate all words into normalized string and make into wordcloud
words = az.concatenate(responses_df)
cloud_stopwords = set(STOPWORDS)
wordcloud = (WordCloud(width = 800, height = 800,
background_color = 'white',
stopwords = cloud_stopwords,
min_font_size = 10).generate(words))

# plot wordcloud by temporarily saving it as a file and displaying it
wordcloud.to_file("resources/images/word_cloud.png")
st.image("resources/images/word_cloud.png")
os.remove("resources/images/word_cloud.png")


def sentiment():
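A possible follow-up, not part of this pull request: the temporary PNG written by frequency_word_cloud could be avoided by rendering the cloud in memory. A minimal sketch under that assumption, using the wordcloud and streamlit packages already listed in the Pipfile; frequency_word_cloud_in_memory is a hypothetical name.

import streamlit as st
from wordcloud import STOPWORDS, WordCloud

import src.analyzer as az


def frequency_word_cloud_in_memory(responses_df):
    """Build a word cloud from the page's responses without a temp file."""
    words = az.concatenate(responses_df)
    cloud = WordCloud(
        width=800,
        height=800,
        background_color="white",
        stopwords=set(STOPWORDS),
        min_font_size=10,
    ).generate(words)
    # WordCloud.to_array() returns the rendered image as a numpy array,
    # which st.image() accepts directly, so nothing is written to disk.
    st.image(cloud.to_array())

This would keep the Streamlit page self-contained and drop the write/remove round-trip through resources/images/.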
19 changes: 18 additions & 1 deletion tests/test_analyzer.py
@@ -1,4 +1,5 @@
"""Test module for analyzer.py"""

import pytest
import src.analyzer as az
import pandas as pd
@@ -146,7 +147,7 @@ def test_sentence_tokenize():


def test_tfidf():
"""test tfidf return result"""
"""Test tfidf return result."""
input_tokens = [
"test",
"tokenize",
@@ -161,6 +162,22 @@
assert vector is not None


def test_concatenate():
    """Test for concatenated string of all words."""
input_dict = {
"What was the most important technical skill that you practiced?":
["Using pipenv and pytest", "Naming variables in Python"],
"What was the most important professional skill that you practiced?":
["Communicating with a team remotely", "Resolving issues by talking \
to teammates"]
}
input_df = pd.DataFrame(input_dict)
output = az.concatenate(input_df)
expected = "using pipenv and pytest communicating with a team remotely \
naming variables in python resolving issues by talking to teammates "
assert output == expected


def test_top_polarized_word():
"""Tests if the positive/negative words columns are created"""
df = pd.DataFrame(columns=[cts.TOKEN, cts.POSITIVE, cts.NEGATIVE])
Binary file added text_classifier
Binary file not shown.