-
Visualize the distribution of data.
-
Visualize the relationship between two features.
-
Visualize composition of data.
-
Visualize comparison of data.
Download database file.
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/LargeData/m4_survey_data.sqlite
--2021-02-15 18:10:03-- https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/LargeData/m4_survey_data.sqlite
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 198.23.119.245
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|198.23.119.245|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36679680 (35M) [application/octet-stream]
Saving to: ‘m4_survey_data.sqlite.4’
m4_survey_data.sqli 100%[===================>] 34.98M 43.7MB/s in 0.8s
2021-02-15 18:10:04 (43.7 MB/s) - ‘m4_survey_data.sqlite.4’ saved [36679680/36679680]
Connect to the database.
import sqlite3
conn = sqlite3.connect("m4_survey_data.sqlite") # open a database connection
Import pandas module.
import pandas as pd
# print how many rows are there in the table named 'master'
QUERY = """
SELECT COUNT(*)
FROM master
"""
# the read_sql_query runs the sql query and returns the data as a dataframe
df = pd.read_sql_query(QUERY,conn)
df.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
COUNT(*) | |
---|---|
0 | 11398 |
# print all the tables names in the database
QUERY = """
SELECT name as Table_Name FROM
sqlite_master WHERE
type = 'table'
"""
# the read_sql_query runs the sql query and returns the data as a dataframe
pd.read_sql_query(QUERY,conn)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Table_Name | |
---|---|
0 | EduOther |
1 | DevType |
2 | LastInt |
3 | JobFactors |
4 | WorkPlan |
5 | WorkChallenge |
6 | LanguageWorkedWith |
7 | LanguageDesireNextYear |
8 | DatabaseWorkedWith |
9 | DatabaseDesireNextYear |
10 | PlatformWorkedWith |
11 | PlatformDesireNextYear |
12 | WebFrameWorkedWith |
13 | WebFrameDesireNextYear |
14 | MiscTechWorkedWith |
15 | MiscTechDesireNextYear |
16 | DevEnviron |
17 | Containers |
18 | SOVisitTo |
19 | SONewContent |
20 | Gender |
21 | Sexuality |
22 | Ethnicity |
23 | master |
QUERY = """
SELECT Age,COUNT(*) as count
FROM master
group by age
order by age
"""
pd.read_sql_query(QUERY,conn)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Age | count | |
---|---|---|
0 | NaN | 287 |
1 | 16.0 | 3 |
2 | 17.0 | 6 |
3 | 18.0 | 29 |
4 | 19.0 | 78 |
5 | 20.0 | 109 |
6 | 21.0 | 203 |
7 | 22.0 | 406 |
8 | 23.0 | 581 |
9 | 24.0 | 679 |
10 | 25.0 | 738 |
11 | 26.0 | 720 |
12 | 27.0 | 724 |
13 | 28.0 | 787 |
14 | 29.0 | 697 |
15 | 30.0 | 651 |
16 | 31.0 | 531 |
17 | 32.0 | 489 |
18 | 33.0 | 483 |
19 | 34.0 | 395 |
20 | 35.0 | 393 |
21 | 36.0 | 308 |
22 | 37.0 | 280 |
23 | 38.0 | 279 |
24 | 39.0 | 232 |
25 | 40.0 | 187 |
26 | 41.0 | 136 |
27 | 42.0 | 162 |
28 | 43.0 | 100 |
29 | 44.0 | 95 |
30 | 45.0 | 85 |
31 | 46.0 | 66 |
32 | 47.0 | 68 |
33 | 48.0 | 64 |
34 | 49.0 | 66 |
35 | 50.0 | 57 |
36 | 51.0 | 29 |
37 | 52.0 | 41 |
38 | 53.0 | 32 |
39 | 54.0 | 26 |
40 | 55.0 | 13 |
41 | 56.0 | 16 |
42 | 57.0 | 11 |
43 | 58.0 | 12 |
44 | 59.0 | 11 |
45 | 60.0 | 2 |
46 | 61.0 | 10 |
47 | 62.0 | 5 |
48 | 63.0 | 7 |
49 | 65.0 | 2 |
50 | 66.0 | 1 |
51 | 67.0 | 1 |
52 | 69.0 | 1 |
53 | 71.0 | 2 |
54 | 72.0 | 1 |
55 | 99.0 | 1 |
table_name = 'master' # the table you wish to describe
QUERY = """
SELECT sql FROM sqlite_master
WHERE name= '{}'
""".format(table_name)
df = pd.read_sql_query(QUERY,conn)
print(df.iat[0,0])
CREATE TABLE "master" (
"index" INTEGER,
"Respondent" INTEGER,
"MainBranch" TEXT,
"Hobbyist" TEXT,
"OpenSourcer" TEXT,
"OpenSource" TEXT,
"Employment" TEXT,
"Country" TEXT,
"Student" TEXT,
"EdLevel" TEXT,
"UndergradMajor" TEXT,
"OrgSize" TEXT,
"YearsCode" TEXT,
"Age1stCode" TEXT,
"YearsCodePro" TEXT,
"CareerSat" TEXT,
"JobSat" TEXT,
"MgrIdiot" TEXT,
"MgrMoney" TEXT,
"MgrWant" TEXT,
"JobSeek" TEXT,
"LastHireDate" TEXT,
"FizzBuzz" TEXT,
"ResumeUpdate" TEXT,
"CurrencySymbol" TEXT,
"CurrencyDesc" TEXT,
"CompTotal" REAL,
"CompFreq" TEXT,
"ConvertedComp" REAL,
"WorkWeekHrs" REAL,
"WorkRemote" TEXT,
"WorkLoc" TEXT,
"ImpSyn" TEXT,
"CodeRev" TEXT,
"CodeRevHrs" REAL,
"UnitTests" TEXT,
"PurchaseHow" TEXT,
"PurchaseWhat" TEXT,
"OpSys" TEXT,
"BlockchainOrg" TEXT,
"BlockchainIs" TEXT,
"BetterLife" TEXT,
"ITperson" TEXT,
"OffOn" TEXT,
"SocialMedia" TEXT,
"Extraversion" TEXT,
"ScreenName" TEXT,
"SOVisit1st" TEXT,
"SOVisitFreq" TEXT,
"SOFindAnswer" TEXT,
"SOTimeSaved" TEXT,
"SOHowMuchTime" TEXT,
"SOAccount" TEXT,
"SOPartFreq" TEXT,
"SOJobs" TEXT,
"EntTeams" TEXT,
"SOComm" TEXT,
"WelcomeChange" TEXT,
"Age" REAL,
"Trans" TEXT,
"Dependents" TEXT,
"SurveyLength" TEXT,
"SurveyEase" TEXT
)
#Plot a histogram of ConvertedComp
QUERY = """
SELECT* FROM master
"""
df = pd.read_sql_query(QUERY,conn)
df.hist(column='ConvertedComp',grid=False)
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f5a34099910>]],
dtype=object)
#Plot a box plot of Age
Query = """
SELECT* FROM master
"""
df = pd.read_sql_query(QUERY,conn)
df.boxplot(column='Age')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5a3ca0bdd0>
# Create a scatter plot of Age & WorkWeekHrs
Query = """
SELECT* FROM master
"""
df = pd.read_sql_query(QUERY,conn)
df.plot(x='Age',y='WorkWeekHrs',kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5a34476690>
Create a bubble plot of WorkWeekHrs
and CodeRevHrs
, use Age
column as bubble size.
#Create a bubble plot of WorkWeekHrs and CodeRevHrs, using Age column as bubble size
import matplotlib.pyplot as plt
import numpy as np
Query = """
SELECT WorkWeekHrs,CodeRevHrs,Age
FROM master
"""
df = pd.read_sql_query(QUERY,conn)
# create data
x = np.random.rand(40)
y = np.random.rand(40)
z = np.random.rand(40)
colors = np.random.rand(40)
# use the scatter function
plt.scatter(x, y, s=z*1000,c=colors)
plt.show()
#pie chart of the top 5 databases that respondents wish to learn next year.
QUERY = """
select count(DatabaseDesireNextYear) as Count,DatabaseDesireNextYear from DatabaseDesireNextYear group by DatabaseDesireNextYear order by count(DatabaseDesireNextYear) DESC LIMIT 5
"""
df2=pd.read_sql_query(QUERY,conn)
df2.set_index('DatabaseDesireNextYear',inplace=True)
df2
labels= ['PstgreSQL','MongoDB','Redis','MySQL','Elasticsearch']
sizes=df2.iloc[:,0]
plt.pie(sizes,labels=labels,startangle=90,shadow=True,explode=(0.1,0.1,0.1,0.1,0.1),autopct='%1.2f%%')
plt.title('Top 5 Databases')
plt.show()
# stacked chart of median WorkWeekHrs and CodeRevHrs for the age group 30 to 35.
QUERY = """
SELECT WorkWeekHrs,CodeRevHrs,Age
FROM master
WHERE Age BETWEEN 30 AND 35
"""
df_Age = pd.read_sql_query(QUERY,conn)
df_Age.head()
df_grouped=df_Age.groupby(df_Age.Age)[['WorkWeekHrs','CodeRevHrs']].median()
df_grouped.head()
df_grouped.plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f5a37384f50>
# Plot the median ConvertedComp for all ages from 45 to 60.
QUERY = """
SELECT ConvertedComp, Age
FROM master
WHERE Age BETWEEN 45 AND 60
"""
df4 = pd.read_sql_query(QUERY,conn)
df4.head()
df4grouped = df4.groupby(df4.Age)[['ConvertedComp']].median()
df4grouped.plot(kind='line',figsize=(12,5))
<matplotlib.axes._subplots.AxesSubplot at 0x7f5a34bb1e90>
# horizontal bar chart using column MainBranch.
QUERY = """
SELECT MainBranch, Count(MainBranch)
FROM master
GROUP BY MainBranch
"""
df_Branch = pd.read_sql_query(QUERY,conn)
df_Branch.head()
df_Branch.plot(kind='barh',figsize=(15,5))
plt.xlabel('MainBranch')
Text(0.5, 0, 'MainBranch')
Close the database connection.
conn.close()