Skip to content

WhitSade/Stack-Overflow-Tech-Trends

Repository files navigation

Stack Overflow Tech Trends

Objectives

  • Visualize the distribution of data.

  • Visualize the relationship between two features.

  • Visualize composition of data.

  • Visualize comparison of data.


Working with database

Download database file.

!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/LargeData/m4_survey_data.sqlite
--2021-02-15 18:10:03--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/LargeData/m4_survey_data.sqlite
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 198.23.119.245
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|198.23.119.245|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36679680 (35M) [application/octet-stream]
Saving to: ‘m4_survey_data.sqlite.4’

m4_survey_data.sqli 100%[===================>]  34.98M  43.7MB/s    in 0.8s    

2021-02-15 18:10:04 (43.7 MB/s) - ‘m4_survey_data.sqlite.4’ saved [36679680/36679680]

Connect to the database.

import sqlite3
conn = sqlite3.connect("m4_survey_data.sqlite") # open a database connection

Import pandas module.

import pandas as pd

Demo: How to run an sql query

# print how many rows are there in the table named 'master'
QUERY = """
SELECT COUNT(*)
FROM master
"""

# the read_sql_query runs the sql query and returns the data as a dataframe
df = pd.read_sql_query(QUERY,conn)
df.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
COUNT(*)
0 11398

Demo: How to list all tables

# print all the tables names in the database
QUERY = """
SELECT name as Table_Name FROM
sqlite_master WHERE
type = 'table'
"""
# the read_sql_query runs the sql query and returns the data as a dataframe
pd.read_sql_query(QUERY,conn)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Table_Name
0 EduOther
1 DevType
2 LastInt
3 JobFactors
4 WorkPlan
5 WorkChallenge
6 LanguageWorkedWith
7 LanguageDesireNextYear
8 DatabaseWorkedWith
9 DatabaseDesireNextYear
10 PlatformWorkedWith
11 PlatformDesireNextYear
12 WebFrameWorkedWith
13 WebFrameDesireNextYear
14 MiscTechWorkedWith
15 MiscTechDesireNextYear
16 DevEnviron
17 Containers
18 SOVisitTo
19 SONewContent
20 Gender
21 Sexuality
22 Ethnicity
23 master

Demo: How to run a group by query

QUERY = """
SELECT Age,COUNT(*) as count
FROM master
group by age
order by age
"""
pd.read_sql_query(QUERY,conn)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Age count
0 NaN 287
1 16.0 3
2 17.0 6
3 18.0 29
4 19.0 78
5 20.0 109
6 21.0 203
7 22.0 406
8 23.0 581
9 24.0 679
10 25.0 738
11 26.0 720
12 27.0 724
13 28.0 787
14 29.0 697
15 30.0 651
16 31.0 531
17 32.0 489
18 33.0 483
19 34.0 395
20 35.0 393
21 36.0 308
22 37.0 280
23 38.0 279
24 39.0 232
25 40.0 187
26 41.0 136
27 42.0 162
28 43.0 100
29 44.0 95
30 45.0 85
31 46.0 66
32 47.0 68
33 48.0 64
34 49.0 66
35 50.0 57
36 51.0 29
37 52.0 41
38 53.0 32
39 54.0 26
40 55.0 13
41 56.0 16
42 57.0 11
43 58.0 12
44 59.0 11
45 60.0 2
46 61.0 10
47 62.0 5
48 63.0 7
49 65.0 2
50 66.0 1
51 67.0 1
52 69.0 1
53 71.0 2
54 72.0 1
55 99.0 1

Demo: How to describe a table

table_name = 'master'  # the table you wish to describe

QUERY = """
SELECT sql FROM sqlite_master
WHERE name= '{}'
""".format(table_name)

df = pd.read_sql_query(QUERY,conn)
print(df.iat[0,0])
CREATE TABLE "master" (
"index" INTEGER,
  "Respondent" INTEGER,
  "MainBranch" TEXT,
  "Hobbyist" TEXT,
  "OpenSourcer" TEXT,
  "OpenSource" TEXT,
  "Employment" TEXT,
  "Country" TEXT,
  "Student" TEXT,
  "EdLevel" TEXT,
  "UndergradMajor" TEXT,
  "OrgSize" TEXT,
  "YearsCode" TEXT,
  "Age1stCode" TEXT,
  "YearsCodePro" TEXT,
  "CareerSat" TEXT,
  "JobSat" TEXT,
  "MgrIdiot" TEXT,
  "MgrMoney" TEXT,
  "MgrWant" TEXT,
  "JobSeek" TEXT,
  "LastHireDate" TEXT,
  "FizzBuzz" TEXT,
  "ResumeUpdate" TEXT,
  "CurrencySymbol" TEXT,
  "CurrencyDesc" TEXT,
  "CompTotal" REAL,
  "CompFreq" TEXT,
  "ConvertedComp" REAL,
  "WorkWeekHrs" REAL,
  "WorkRemote" TEXT,
  "WorkLoc" TEXT,
  "ImpSyn" TEXT,
  "CodeRev" TEXT,
  "CodeRevHrs" REAL,
  "UnitTests" TEXT,
  "PurchaseHow" TEXT,
  "PurchaseWhat" TEXT,
  "OpSys" TEXT,
  "BlockchainOrg" TEXT,
  "BlockchainIs" TEXT,
  "BetterLife" TEXT,
  "ITperson" TEXT,
  "OffOn" TEXT,
  "SocialMedia" TEXT,
  "Extraversion" TEXT,
  "ScreenName" TEXT,
  "SOVisit1st" TEXT,
  "SOVisitFreq" TEXT,
  "SOFindAnswer" TEXT,
  "SOTimeSaved" TEXT,
  "SOHowMuchTime" TEXT,
  "SOAccount" TEXT,
  "SOPartFreq" TEXT,
  "SOJobs" TEXT,
  "EntTeams" TEXT,
  "SOComm" TEXT,
  "WelcomeChange" TEXT,
  "Age" REAL,
  "Trans" TEXT,
  "Dependents" TEXT,
  "SurveyLength" TEXT,
  "SurveyEase" TEXT
)

Visualizing distribution of data

Histograms

#Plot a histogram of ConvertedComp
QUERY = """
SELECT* FROM master
"""

df = pd.read_sql_query(QUERY,conn)
df.hist(column='ConvertedComp',grid=False)





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f5a34099910>]],
      dtype=object)

png

Box Plots

#Plot a box plot of Age
Query = """
SELECT* FROM master
"""
df = pd.read_sql_query(QUERY,conn)
df.boxplot(column='Age')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5a3ca0bdd0>

png

Visualizing relationships in data

Scatter Plots

# Create a scatter plot of Age & WorkWeekHrs
Query = """
SELECT* FROM master
"""
df = pd.read_sql_query(QUERY,conn)
df.plot(x='Age',y='WorkWeekHrs',kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5a34476690>

png

Bubble Plots

Create a bubble plot of WorkWeekHrs and CodeRevHrs, use Age column as bubble size.

#Create a bubble plot of WorkWeekHrs and CodeRevHrs, using Age column as bubble size 
import matplotlib.pyplot as plt
import numpy as np

Query = """
SELECT WorkWeekHrs,CodeRevHrs,Age
FROM master
"""
df = pd.read_sql_query(QUERY,conn)
# create data
x = np.random.rand(40)
y = np.random.rand(40)
z = np.random.rand(40)
colors = np.random.rand(40) 
# use the scatter function
plt.scatter(x, y, s=z*1000,c=colors) 
plt.show()

png

Visualizing composition of data

Pie Charts

#pie chart of the top 5 databases that respondents wish to learn next year.
QUERY = """

select count(DatabaseDesireNextYear) as Count,DatabaseDesireNextYear from DatabaseDesireNextYear group by DatabaseDesireNextYear order by count(DatabaseDesireNextYear) DESC LIMIT 5

"""

df2=pd.read_sql_query(QUERY,conn)

df2.set_index('DatabaseDesireNextYear',inplace=True)

df2

labels= ['PstgreSQL','MongoDB','Redis','MySQL','Elasticsearch']
sizes=df2.iloc[:,0]
plt.pie(sizes,labels=labels,startangle=90,shadow=True,explode=(0.1,0.1,0.1,0.1,0.1),autopct='%1.2f%%')
plt.title('Top 5 Databases')
plt.show()

png

Stacked Charts

# stacked chart of median WorkWeekHrs and CodeRevHrs for the age group 30 to 35.
QUERY = """

SELECT WorkWeekHrs,CodeRevHrs,Age 
FROM master
WHERE Age BETWEEN 30 AND 35

"""


df_Age = pd.read_sql_query(QUERY,conn)
df_Age.head()

df_grouped=df_Age.groupby(df_Age.Age)[['WorkWeekHrs','CodeRevHrs']].median()
df_grouped.head()
df_grouped.plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f5a37384f50>

png

Visualizing comparison of data

Line Chart

# Plot the median ConvertedComp for all ages from 45 to 60.
QUERY = """

SELECT ConvertedComp, Age 
FROM master
WHERE Age BETWEEN 45 AND 60

"""
df4 = pd.read_sql_query(QUERY,conn)
df4.head()
df4grouped = df4.groupby(df4.Age)[['ConvertedComp']].median()
df4grouped.plot(kind='line',figsize=(12,5))
<matplotlib.axes._subplots.AxesSubplot at 0x7f5a34bb1e90>

png

Bar Chart

# horizontal bar chart using column MainBranch.
QUERY = """
SELECT MainBranch, Count(MainBranch)
FROM master
GROUP BY MainBranch
"""
df_Branch = pd.read_sql_query(QUERY,conn)
df_Branch.head()
df_Branch.plot(kind='barh',figsize=(15,5))
plt.xlabel('MainBranch')
Text(0.5, 0, 'MainBranch')

png

Close the database connection.

conn.close()

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published