import pandas as pd
# This time let's skip the 2nd row, which contains questions
kaggle2021 = pd.read_csv('../data/kaggle_survey_2021_responses.csv', skiprows = [1])
kaggle2021.head(n = 1)
/tmp/ipykernel_20081/2302513793.py:2: DtypeWarning: Columns (195,201) have mixed types. Specify dtype option on import or set low_memory=False. kaggle2021 = pd.read_csv('../data/kaggle_survey_2021_responses.csv', skiprows = [1])
Time from Start to Finish (seconds) | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7_Part_1 | Q7_Part_2 | Q7_Part_3 | ... | Q38_B_Part_3 | Q38_B_Part_4 | Q38_B_Part_5 | Q38_B_Part_6 | Q38_B_Part_7 | Q38_B_Part_8 | Q38_B_Part_9 | Q38_B_Part_10 | Q38_B_Part_11 | Q38_B_OTHER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 910 | 50-54 | Man | India | Bachelor’s degree | Other | 5-10 years | Python | R | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 rows × 369 columns
# We will load the questions as a separate dataset
kaggle2021_qs = pd.read_csv('../data/kaggle_survey_2021_responses.csv', nrows = 1)
kaggle2021_qs
Time from Start to Finish (seconds) | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7_Part_1 | Q7_Part_2 | Q7_Part_3 | ... | Q38_B_Part_3 | Q38_B_Part_4 | Q38_B_Part_5 | Q38_B_Part_6 | Q38_B_Part_7 | Q38_B_Part_8 | Q38_B_Part_9 | Q38_B_Part_10 | Q38_B_Part_11 | Q38_B_OTHER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Duration (in seconds) | What is your age (# years)? | What is your gender? - Selected Choice | In which country do you currently reside? | What is the highest level of formal education ... | Select the title most similar to your current ... | For how many years have you been writing code ... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | What programming languages do you use on a reg... | ... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... | In the next 2 years, do you hope to become mor... |
1 rows × 369 columns
NaN
)kaggle2021.describe() # DataFrame.describe() provides an range of summary statistics
Time from Start to Finish (seconds) | Q30_B_Part_1 | Q30_B_Part_2 | Q30_B_Part_3 | Q30_B_Part_4 | Q30_B_Part_5 | Q30_B_Part_6 | Q30_B_Part_7 | Q30_B_OTHER | |
---|---|---|---|---|---|---|---|---|---|
count | 2.597300e+04 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
mean | 1.105466e+04 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
std | 1.014716e+05 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
min | 1.200000e+02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25% | 4.430000e+02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
50% | 6.560000e+02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
75% | 1.038000e+03 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
max | 2.488653e+06 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
kaggle2021.iloc[:,0].mean() # Rather than using describe(), we can apply individual methods
11054.66492126439
kaggle2021.iloc[:,0].median() # Median
656.0
kaggle2021.iloc[:,0].std() # Standard deviation
101471.6221245172
import statistics ## We don't have to rely only on methods provided by `pandas`
statistics.stdev(kaggle2021.iloc[:,0])
101471.6221245172
kaggle2021.describe(include = 'all') # Adding include = 'all' tells pandas to summarize all variables
Time from Start to Finish (seconds) | Q1 | Q2 | Q3 | Q4 | Q5 | Q6 | Q7_Part_1 | Q7_Part_2 | Q7_Part_3 | ... | Q38_B_Part_3 | Q38_B_Part_4 | Q38_B_Part_5 | Q38_B_Part_6 | Q38_B_Part_7 | Q38_B_Part_8 | Q38_B_Part_9 | Q38_B_Part_10 | Q38_B_Part_11 | Q38_B_OTHER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2.597300e+04 | 25973 | 25973 | 25973 | 25973 | 25973 | 25973 | 21860 | 5334 | 10756 | ... | 633 | 591 | 4239 | 729 | 737 | 1020 | 666 | 2747 | 4542 | 377 |
unique | NaN | 11 | 5 | 66 | 7 | 15 | 7 | 1 | 1 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
top | NaN | 25-29 | Man | India | Master’s degree | Student | 1-3 years | Python | R | SQL | ... | Comet.ml | Sacred + Omniboard | TensorBoard | Guild.ai | Polyaxon | ClearML | Domino Model Monitor | MLflow | None | Other |
freq | NaN | 4931 | 20598 | 7434 | 10132 | 6804 | 7874 | 21860 | 5334 | 10756 | ... | 633 | 591 | 4239 | 729 | 737 | 1020 | 666 | 2747 | 4542 | 377 |
mean | 1.105466e+04 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
std | 1.014716e+05 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
min | 1.200000e+02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25% | 4.430000e+02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
50% | 6.560000e+02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
75% | 1.038000e+03 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
max | 2.488653e+06 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11 rows × 369 columns
kaggle2021.iloc[:,2].mode() # Mode, most frequent value
0 Man Name: Q2, dtype: object
kaggle2021.iloc[:,2].value_counts() # Counts of unique values
Man 20598 Woman 4890 Prefer not to say 355 Nonbinary 88 Prefer to self-describe 42 Name: Q2, dtype: int64
kaggle2021.iloc[:,2].value_counts(normalize = True) # We can further normalize them by the number of rows
Man 0.793054 Woman 0.188272 Prefer not to say 0.013668 Nonbinary 0.003388 Prefer to self-describe 0.001617 Name: Q2, dtype: float64
Method | Numeric | Categorical | Description |
---|---|---|---|
count |
yes | yes | Number of non-NA observations |
value_counts |
yes | yes | Number of unique observations by value |
describe |
yes | yes | Set of summary statistics for Series/DataFrame |
min , max |
yes | yes (caution) | Minimum and maximum values |
quantile |
yes | no | Sample quantile ranging from 0 to 1 |
sum |
yes | yes (caution) | Sum of values |
prod |
yes | no | Product of values |
mean |
yes | no | Mean |
median |
yes | no | Median (50% quantile) |
var |
yes | no | Sample variance |
std |
yes | no | Sample standard deviation |
skew |
yes | no | Sample skewness (third moment) |
kurt |
yes | no | Sample kurtosis (fourth moment) |
pd.crosstab()
method is a special case of pd.pivot_table()
# Calculate crosstabulation between 'Age group' (Q1) and 'Gender' (Q2)
pd.crosstab(kaggle2021['Q1'], kaggle2021['Q2'])
Q2 | Man | Nonbinary | Prefer not to say | Prefer to self-describe | Woman |
---|---|---|---|---|---|
Q1 | |||||
18-21 | 3696 | 16 | 60 | 12 | 1117 |
22-24 | 3643 | 13 | 66 | 9 | 963 |
25-29 | 3859 | 12 | 61 | 5 | 994 |
30-34 | 2765 | 17 | 34 | 7 | 618 |
35-39 | 1993 | 7 | 42 | 7 | 455 |
40-44 | 1537 | 4 | 31 | 1 | 317 |
45-49 | 1171 | 4 | 24 | 1 | 175 |
50-54 | 811 | 3 | 14 | 0 | 136 |
55-59 | 509 | 4 | 7 | 0 | 72 |
60-69 | 504 | 4 | 10 | 0 | 35 |
70+ | 110 | 4 | 6 | 0 | 8 |
# It is often useful to see the proportions/percentages rather than raw counts
pd.crosstab(kaggle2021['Q1'], kaggle2021['Q2'], normalize = 'columns')
Q2 | Man | Nonbinary | Prefer not to say | Prefer to self-describe | Woman |
---|---|---|---|---|---|
Q1 | |||||
18-21 | 0.179435 | 0.181818 | 0.169014 | 0.285714 | 0.228425 |
22-24 | 0.176862 | 0.147727 | 0.185915 | 0.214286 | 0.196933 |
25-29 | 0.187348 | 0.136364 | 0.171831 | 0.119048 | 0.203272 |
30-34 | 0.134236 | 0.193182 | 0.095775 | 0.166667 | 0.126380 |
35-39 | 0.096757 | 0.079545 | 0.118310 | 0.166667 | 0.093047 |
40-44 | 0.074619 | 0.045455 | 0.087324 | 0.023810 | 0.064826 |
45-49 | 0.056850 | 0.045455 | 0.067606 | 0.023810 | 0.035787 |
50-54 | 0.039373 | 0.034091 | 0.039437 | 0.000000 | 0.027812 |
55-59 | 0.024711 | 0.045455 | 0.019718 | 0.000000 | 0.014724 |
60-69 | 0.024468 | 0.045455 | 0.028169 | 0.000000 | 0.007157 |
70+ | 0.005340 | 0.045455 | 0.016901 | 0.000000 | 0.001636 |
pivot_table
¶# For `values` variable we use `Q3`, but any other would work equally well
pd.pivot_table(kaggle2021, index = 'Q1', columns = 'Q2', values = 'Q3', aggfunc = 'count', fill_value = 0)
Q2 | Man | Nonbinary | Prefer not to say | Prefer to self-describe | Woman |
---|---|---|---|---|---|
Q1 | |||||
18-21 | 3696 | 16 | 60 | 12 | 1117 |
22-24 | 3643 | 13 | 66 | 9 | 963 |
25-29 | 3859 | 12 | 61 | 5 | 994 |
30-34 | 2765 | 17 | 34 | 7 | 618 |
35-39 | 1993 | 7 | 42 | 7 | 455 |
40-44 | 1537 | 4 | 31 | 1 | 317 |
45-49 | 1171 | 4 | 24 | 1 | 175 |
50-54 | 811 | 3 | 14 | 0 | 136 |
55-59 | 509 | 4 | 7 | 0 | 72 |
60-69 | 504 | 4 | 10 | 0 | 35 |
70+ | 110 | 4 | 6 | 0 | 8 |
matplotlib
has become the one of standard solutionspandas
seaborn
and plotnine
pandas
itself has some limited plotting facilitiesplotnine
- ggplot
for Python¶plotnine
implements Grammar of Graphics data visualisation scheme (Wilkinson, 2005)ggplot2
syntax (Wickham, 2010)plotnine
¶plotline
has the following structure:ggplot(data = <DATA>) +\
<GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))
ggplot(data = <DATA>, mapping = aes(<MAPPINGS>)) +\
<GEOM_FUNCTION>() +\
<GEOM_FUNCTION>()
plotnine
¶from plotnine import *
q1_plot = ggplot(data = kaggle2021) + geom_bar(aes(x = 'Q1')) # Basic 'Age group' (Q1) bar chart
q1_plot
<ggplot: (8783323507023)>
pandas
¶# First we need to group dataset by 'Age group' (Q1) and summarize it with `size()`
kaggle2021_q1_grouped = kaggle2021.groupby(['Q1']).size()
kaggle2021_q1_grouped.head(n = 3)
Q1 18-21 4901 22-24 4694 25-29 4931 dtype: int64
kaggle2021_q1_grouped.plot(kind = 'bar')
<AxesSubplot:xlabel='Q1'>
matplotlib
¶import matplotlib.pyplot as plt
# `matplotlib` is more low-level library
# plots would need more work to be 'prettified'
plt.bar(x = kaggle2021_q1_grouped.index, height = kaggle2021_q1_grouped.values)
<BarContainer object of 11 artists>
plotnine
¶# Here we change default axes' labels and then apply B&W theme
q1_plot_pretty = q1_plot +\
labs(x = 'Age group', y = 'respondents') +\
theme_bw()
q1_plot_pretty
<ggplot: (8783321248031)>
geom_
)¶Method | Description |
---|---|
geom_bar() , geom_col() |
Bar charts |
geom_boxplot() |
Box and whisker plot |
geom_histogram() |
Histogram |
geom_point() |
Scatterplot |
geom_line() , geom_path() |
Lines |
geom_map() |
Geographic areas |
geom_smooth() |
Smoothed conditional means |
geom_violin() |
Violin plots |
plotnine
¶q1_plot_pretty.save('../temp/q1_plot_pretty.pdf')
/home/tpaskhalis/Decrypted/Git/RECSM/venv/lib/python3.8/site-packages/plotnine/ggplot.py:719: PlotnineWarning: Saving 6.4 x 4.8 in image. /home/tpaskhalis/Decrypted/Git/RECSM/venv/lib/python3.8/site-packages/plotnine/ggplot.py:722: PlotnineWarning: Filename: ../temp/q1_plot_pretty.pdf
Books:
Online: