import pandas as pd


# This time let's skip the 2nd row of the file, which contains the questions.
# `low_memory=False` lets pandas infer every column's type from the whole
# file at once, which avoids the DtypeWarning about mixed-type columns.
# NOTE(review): the variable is called `kaggle2022`, but the file name refers
# to the 2021 survey - confirm which one is intended.
kaggle2022 = pd.read_csv(
    '../data/kaggle_survey_2021_responses.csv',
    skiprows=[1],
    low_memory=False,
)
kaggle2022.head(n=1)

/tmp/ipykernel_779556/1478523914.py:2: DtypeWarning: Columns (195,201) have mixed types. Specify dtype option on import or set low_memory=False.
  kaggle2022 = pd.read_csv(


# The questions are kept as a separate dataset:
# reading a single row (below the header) is enough to get them
kaggle2022_qs = pd.read_csv('../data/kaggle_survey_2021_responses.csv', nrows=1)
kaggle2022_qs


# pd.DataFrame.describe() provides a range of summary statistics
kaggle2022.describe()


# Rather than using describe(), we can apply individual methods, e.g. the mean
kaggle2022.iloc[:, 0].mean()

11054.66492126439


# Median of the first column
kaggle2022.iloc[:, 0].median()

656.0


# Standard deviation of the first column
kaggle2022.iloc[:, 0].std()

101471.6221245172


# We don't have to rely only on methods provided by `pandas`:
# the `statistics` module can be applied to the same column
import statistics
statistics.stdev(kaggle2022.iloc[:, 0])

101471.6221245172


# With include='all' pandas summarizes all variables
kaggle2022.describe(include='all')


# Mode, i.e. the most frequent value
kaggle2022.iloc[:, 2].mode()

0    Man
Name: Q2, dtype: object


# Counts of unique values
kaggle2022.iloc[:, 2].value_counts()

Man                        20598
Woman                       4890
Prefer not to say            355
Nonbinary                     88
Prefer to self-describe       42
Name: Q2, dtype: int64


# The counts can also be normalized, which turns them into proportions
kaggle2022.iloc[:, 2].value_counts(normalize=True)

Man                        0.793054
Woman                      0.188272
Prefer not to say          0.013668
Nonbinary                  0.003388
Prefer to self-describe    0.001617
Name: Q2, dtype: float64


# A small 'wide' table: one row per country, one column per year
df_wide = pd.DataFrame(
    data=[
        ['Afghanistan', 745, 37737],
        ['Brazil', 2666, 80488],
    ],
    columns=['country', '1999', '2000'],
)
df_wide


# Pivoting long: every year column becomes a (year, cases) pair
df_long = df_wide.melt(id_vars=['country'], var_name='year', value_name='cases')
df_long


# Pivoting wide: the years are spread back out into separate columns
df_wide = df_long.pivot(index='country', columns='year', values='cases')
df_wide


# `pivot` builds the index from the column used for the row labels,
# so `reset_index` can be used to move that data back into a column
df_wide.reset_index()


# Crosstabulation of 'Age group' (Q1) against 'Gender' (Q2), as raw counts
pd.crosstab(index=kaggle2022['Q1'], columns=kaggle2022['Q2'])


# Proportions/percentages are often more useful than raw counts
pd.crosstab(
    index=kaggle2022['Q1'],
    columns=kaggle2022['Q2'],
    normalize='columns',
)


# For the `values` variable we use `Q3`, simply because something has to be counted.
# NOTE: 'count' tallies non-missing values only, so another column gives the
# same table only if it has no missing values (as is the case for `Q3` here)
pd.pivot_table(
    kaggle2022, index = 'Q1', columns = 'Q2', values = 'Q3',
    aggfunc = 'count', fill_value = 0
)


from plotnine import *


# Basic bar chart of 'Age group' (Q1)
q1_plot = (
    ggplot(data=kaggle2022)
    + geom_bar(aes(x='Q1'))
)
q1_plot

<ggplot: (8791535819521)>


# Group the dataset by 'Age group' (Q1), then summarize each group with `size()`
kaggle2022_q1_grouped = kaggle2022.groupby('Q1').size()
kaggle2022_q1_grouped.head(3)

Q1
18-21    4901
22-24    4694
25-29    4931
dtype: int64


# IPython magic (not plain Python): show matplotlib figures inline in the notebook
%matplotlib inline
# Bar chart of the grouped counts using the plotting method built into pandas
kaggle2022_q1_grouped.plot(kind = 'bar')

<AxesSubplot: xlabel='Q1'>


import matplotlib.pyplot as plt


# `matplotlib` is a more low-level library:
# its plots would need more work to be 'prettified'
plt.bar(kaggle2022_q1_grouped.index, kaggle2022_q1_grouped.values)

<BarContainer object of 11 artists>


# Replace the default axis labels, then apply the black-and-white theme
q1_plot_pretty = (
    q1_plot
    + labs(x='Age group', y='respondents')
    + theme_bw()
)
q1_plot_pretty

<ggplot: (8791533522590)>


# Write the prettified plot out as a PDF file
q1_plot_pretty.save(filename='../temp/q1_plot_pretty.pdf')

/home/tp1587/Decrypted/Git/Python_Social_Data_Science/venv/lib/python3.10/site-packages/plotnine/ggplot.py:718: PlotnineWarning: Saving 6.4 x 4.8 in image.
/home/tp1587/Decrypted/Git/Python_Social_Data_Science/venv/lib/python3.10/site-packages/plotnine/ggplot.py:719: PlotnineWarning: Filename: ../temp/q1_plot_pretty.pdf


import pandas as pd
# Load the data for Anscombe's quartet
anscombe_quartet = pd.read_csv(filepath_or_buffer='../data/anscombes_quartet.csv')


# Peek at the first rows
anscombe_quartet.head()


# `groupby` gives us the summary statistics separately for each 'dataset'
anscombe_quartet.groupby('dataset').describe()


from plotnine import *

# Scatterplot with a linear fit ('lm'), drawn in a separate panel per dataset
(
    ggplot(anscombe_quartet, aes(x='x', y='y'))
    + geom_point(colour='red')
    + geom_smooth(method='lm', se=False, fullrange=True)
    + facet_wrap('dataset')
    + theme_bw()
)

<ggplot: (8791533512158)>


import statsmodels.api as sm
import statsmodels.formula.api as smf # Formula API provides R-style formula specification


# Reload the survey responses, again skipping the 2nd row of the file.
# `low_memory=False` lets pandas infer every column's type from the whole
# file at once, which avoids the DtypeWarning about mixed-type columns.
kaggle2022 = pd.read_csv(
    '../data/kaggle_survey_2021_responses.csv',
    skiprows=[1],
    low_memory=False,
)

/tmp/ipykernel_779556/2819004467.py:1: DtypeWarning: Columns (195,201) have mixed types. Specify dtype option on import or set low_memory=False.


# Let's give more intuitive names to our variables
kaggle2022 = kaggle2022.rename(columns=dict(
    Q1='age',
    Q2='gender',
    Q3='country',
    Q4='education',
    Q25='compensation',
))


kaggle2022['compensation'].head(2)

0    25,000-29,999
1    60,000-69,999
Name: compensation, dtype: object


from statistics import mean

# Extraneous symbols that have to be removed before a bound can be parsed
_COMPENSATION_NOISE = str.maketrans('', '', ',$>')


def _compensation_midpoint(label):
    """Return the midpoint of a compensation range label as a float.

    The label is split on '-' and every bound is stripped of ',', '$' and '>'
    before being converted, e.g. '$100,000-$124,999' becomes 112499.5.
    A label without '-' yields that single number, and a missing value
    stays missing (NaN).
    """
    if pd.isna(label):
        # Keep missing answers missing instead of parsing the text 'nan'
        return float('nan')
    bounds = [
        float(bound.translate(_COMPENSATION_NOISE))
        for bound in str(label).split('-')
    ]
    return mean(bounds)


# Here we are replacing each compensation range by its midpoint
# (e.g. 112499.5 for $100,000-$124,999)
kaggle2022['compensation'] = kaggle2022['compensation'].map(_compensation_midpoint)


# Level of compensation (in USD, our DV)
kaggle2022['compensation']

0        27499.5
1        64999.5
2          499.5
3        34999.5
4        34999.5
          ...   
25968    17499.5
25969        NaN
25970      499.5
25971        NaN
25972      499.5
Name: compensation, Length: 25973, dtype: float64


# How often each gender category (our IV) occurs
kaggle2022.loc[:, 'gender'].value_counts()

Man                        20598
Woman                       4890
Prefer not to say            355
Nonbinary                     88
Prefer to self-describe       42
Name: gender, dtype: int64


# The formula interface allows us to write the model specification
# as 'DV ~ IV_1 + IV_2 + ... + IV_N'
fit1 = smf.ols(formula='compensation ~ gender', data=kaggle2022).fit()


fit1.summary()


# Now let's additionally control for age and education
fit2 = smf.ols('compensation ~ gender + age + education', data=kaggle2022).fit()


fit2.summary()


# Back to the summary statistics of Anscombe's quartet (first three columns only)
anscombe_quartet.groupby('dataset').describe().iloc[:, :3]


# Print the same summary as a Markdown table.
# NOTE(review): `index = False` leaves out the 'dataset' row labels, so the rows
# of the printed table cannot be told apart - confirm this is intended
print(anscombe_quartet.groupby(['dataset']).describe().iloc[:,0:3].to_markdown(index = False))

|   ('x', 'count') |   ('x', 'mean') |   ('x', 'std') |
|-----------------:|----------------:|---------------:|
|               11 |               9 |        3.31662 |
|               11 |               9 |        3.31662 |
|               11 |               9 |        3.31662 |
|               11 |               9 |        3.31662 |

Method	Numeric	Categorical	Description
`count`	yes	yes	Number of non-NA observations
`value_counts`	yes	yes	Number of observations for each unique value
`describe`	yes	yes	Set of summary statistics for Series/DataFrame
`min`, `max`	yes	yes (caution)	Minimum and maximum values
`quantile`	yes	no	Sample quantile at a given level between 0 and 1
`sum`	yes	yes (caution)	Sum of values
`prod`	yes	no	Product of values
`mean`	yes	no	Mean
`median`	yes	no	Median (50% quantile)
`var`	yes	no	Sample variance
`std`	yes	no	Sample standard deviation
`skew`	yes	no	Sample skewness (third moment)
`kurt`	yes	no	Sample kurtosis (fourth moment)

Q2	Man	Nonbinary	Prefer not to say	Prefer to self-describe	Woman
Q1
18-21	0.179435	0.181818	0.169014	0.285714	0.228425
22-24	0.176862	0.147727	0.185915	0.214286	0.196933
25-29	0.187348	0.136364	0.171831	0.119048	0.203272
30-34	0.134236	0.193182	0.095775	0.166667	0.126380
35-39	0.096757	0.079545	0.118310	0.166667	0.093047
40-44	0.074619	0.045455	0.087324	0.023810	0.064826
45-49	0.056850	0.045455	0.067606	0.023810	0.035787
50-54	0.039373	0.034091	0.039437	0.000000	0.027812
55-59	0.024711	0.045455	0.019718	0.000000	0.014724
60-69	0.024468	0.045455	0.028169	0.000000	0.007157
70+	0.005340	0.045455	0.016901	0.000000	0.001636

Method	Description
`geom_bar()`, `geom_col()`	Bar charts
`geom_boxplot()`	Box and whisker plot
`geom_histogram()`	Histogram
`geom_point()`	Scatterplot
`geom_line()`, `geom_path()`	Lines
`geom_map()`	Geographic areas
`geom_smooth()`	Smoothed conditional means
`geom_violin()`	Violin plots

	dataset	x	y
0	I	10	8.04
1	I	8	6.95
2	I	13	7.58
3	I	9	8.81
4	I	11	8.33

	x								y
	count	mean	std	min	25%	50%	75%	max	count	mean	std	min	25%	50%	75%	max
dataset
I	11.0	9.0	3.316625	4.0	6.5	9.0	11.5	14.0	11.0	7.500909	2.031568	4.26	6.315	7.58	8.57	10.84
II	11.0	9.0	3.316625	4.0	6.5	9.0	11.5	14.0	11.0	7.500909	2.031657	3.10	6.695	8.14	8.95	9.26
III	11.0	9.0	3.316625	4.0	6.5	9.0	11.5	14.0	11.0	7.500000	2.030424	5.39	6.250	7.11	7.98	12.74
IV	11.0	9.0	3.316625	8.0	8.0	8.0	8.0	19.0	11.0	7.500909	2.030579	5.25	6.170	7.04	8.19	12.50

	Time from Start to Finish (seconds)	Q30_B_Part_1	Q30_B_Part_2	Q30_B_Part_3	Q30_B_Part_4	Q30_B_Part_5	Q30_B_Part_6	Q30_B_Part_7	Q30_B_OTHER
count	2.597300e+04	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
mean	1.105466e+04	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
std	1.014716e+05	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
min	1.200000e+02	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25%	4.430000e+02	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
50%	6.560000e+02	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
75%	1.038000e+03	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
max	2.488653e+06	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Time from Start to Finish (seconds)	Q1	Q2	Q3	Q4	Q5	Q6	Q7_Part_1	Q7_Part_2	Q7_Part_3	...	Q38_B_Part_3	Q38_B_Part_4	Q38_B_Part_5	Q38_B_Part_6	Q38_B_Part_7	Q38_B_Part_8	Q38_B_Part_9	Q38_B_Part_10	Q38_B_Part_11	Q38_B_OTHER
count	2.597300e+04	25973	25973	25973	25973	25973	25973	21860	5334	10756	...	633	591	4239	729	737	1020	666	2747	4542	377
unique	NaN	11	5	66	7	15	7	1	1	1	...	1	1	1	1	1	1	1	1	1	1
top	NaN	25-29	Man	India	Master’s degree	Student	1-3 years	Python	R	SQL	...	Comet.ml	Sacred + Omniboard	TensorBoard	Guild.ai	Polyaxon	ClearML	Domino Model Monitor	MLflow	None	Other
freq	NaN	4931	20598	7434	10132	6804	7874	21860	5334	10756	...	633	591	4239	729	737	1020	666	2747	4542	377
mean	1.105466e+04	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
std	1.014716e+05	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
min	1.200000e+02	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25%	4.430000e+02	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
50%	6.560000e+02	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
75%	1.038000e+03	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
max	2.488653e+06	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	country	year	cases
0	Afghanistan	1999	745
1	Brazil	1999	2666
2	Afghanistan	2000	37737
3	Brazil	2000	80488

Q2	Man	Nonbinary	Prefer not to say	Prefer to self-describe	Woman
Q1
18-21	3696	16	60	12	1117
22-24	3643	13	66	9	963
25-29	3859	12	61	5	994
30-34	2765	17	34	7	618
35-39	1993	7	42	7	455
40-44	1537	4	31	1	317
45-49	1171	4	24	1	175
50-54	811	3	14	0	136
55-59	509	4	7	0	72
60-69	504	4	10	0	35
70+	110	4	6	0	8

Dep. Variable:	compensation	R-squared:	0.007
Model:	OLS	Adj. R-squared:	0.007
Method:	Least Squares	F-statistic:	26.46
Date:	Sun, 20 Nov 2022	Prob (F-statistic):	6.64e-22
Time:	15:32:20	Log-Likelihood:	-1.9707e+05
No. Observations:	15391	AIC:	3.941e+05
Df Residuals:	15386	BIC:	3.942e+05
Df Model:	4
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	4.593e+04	782.805	58.672	0.000	4.44e+04	4.75e+04
gender[T.Nonbinary]	7.016e+04	1.29e+04	5.455	0.000	4.49e+04	9.54e+04
gender[T.Prefer not to say]	2.166e+04	6335.402	3.419	0.001	9242.046	3.41e+04
gender[T.Prefer to self-describe]	2.498e+04	1.8e+04	1.389	0.165	-1.03e+04	6.02e+04
gender[T.Woman]	-1.466e+04	1932.351	-7.588	0.000	-1.84e+04	-1.09e+04

Omnibus:	18458.954	Durbin-Watson:	2.001
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2343660.164
Skew:	6.472	Prob(JB):	0.00
Kurtosis:	62.051	Cond. No.	25.7

Omnibus:	19216.780	Durbin-Watson:	2.008
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2958551.621
Skew:	6.892	Prob(JB):	0.00
Kurtosis:	69.509	Cond. No.	30.2


pd.DataFrame.pivot()	pd.DataFrame.melt()

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	1.483e+04	2910.746	5.093	0.000	9120.353	2.05e+04
gender[T.Nonbinary]	6.443e+04	1.25e+04	5.163	0.000	4e+04	8.89e+04
gender[T.Prefer not to say]	2.033e+04	6157.104	3.302	0.001	8261.253	3.24e+04
gender[T.Prefer to self-describe]	3.486e+04	1.74e+04	1.998	0.046	667.732	6.91e+04
gender[T.Woman]	-1.1e+04	1884.889	-5.838	0.000	-1.47e+04	-7309.568
age[T.22-24]	2313.5562	3387.043	0.683	0.495	-4325.450	8952.562
age[T.25-29]	8811.2235	3235.996	2.723	0.006	2468.289	1.52e+04
age[T.30-34]	2.427e+04	3347.112	7.251	0.000	1.77e+04	3.08e+04
age[T.35-39]	3.405e+04	3485.133	9.770	0.000	2.72e+04	4.09e+04
age[T.40-44]	4.192e+04	3656.035	11.465	0.000	3.48e+04	4.91e+04
age[T.45-49]	5.368e+04	3872.345	13.862	0.000	4.61e+04	6.13e+04
age[T.50-54]	5.354e+04	4211.610	12.713	0.000	4.53e+04	6.18e+04
age[T.55-59]	6.666e+04	4792.714	13.910	0.000	5.73e+04	7.61e+04
age[T.60-69]	5.718e+04	4971.283	11.502	0.000	4.74e+04	6.69e+04
age[T.70+]	6.918e+04	9180.635	7.535	0.000	5.12e+04	8.72e+04
education[T.Doctoral degree]	1.187e+04	2322.458	5.111	0.000	7318.325	1.64e+04
education[T.I prefer not to answer]	-1.029e+04	4857.239	-2.118	0.034	-1.98e+04	-769.130
education[T.Master’s degree]	7168.1892	1666.521	4.301	0.000	3901.611	1.04e+04
education[T.No formal education past high school]	-1.059e+04	5801.400	-1.825	0.068	-2.2e+04	782.116
education[T.Professional doctorate]	8857.2561	5216.498	1.698	0.090	-1367.698	1.91e+04
education[T.Some college/university study without earning a bachelor’s degree]	-912.0023	3378.159	-0.270	0.787	-7533.593	5709.588

Week 6: Data Analysis and Communicating Results¶

Python for Social Data Science¶

Tom Paskhalis¶

Exploratory data analysis¶

Measurement scales¶

Measurement scales in Pandas¶

Loading the dataset¶

Loading the dataset continued¶

Summarizing numeric variables¶

Methods for summarizing numeric variables¶

Summarizing categorical variables¶

Methods for summarizing categorical variables¶

Summary of descriptive statistics methods¶

Pivoting data in pandas¶

pd.DataFrame.pivot()

pd.DataFrame.melt()

Pivoting data example¶

Pivoting data example continued¶

Crosstabulation¶

Crosstabulation in pandas¶

Margins in crosstab¶

Crosstabulation with pivot_table¶

Data visualization¶

Data visualization in Python¶

plotnine - ggplot for Python¶

Grammar of graphics¶

Structure of ggplot calls in plotnine¶

Creating a ggplot in plotnine¶

Compare to base pandas¶

Compare to matplotlib¶

Prettifying ggplot in plotnine¶

Other geometric objects (geom_)¶

Writing plots out in plotnine¶

Additional visualization materials¶

Regression analysis¶

Anscombe's quartet¶

Data for Anscombe's quartet¶

Summary statistics for Anscombe's quartet¶

Plotting Anscombe's quartet¶

Linear regression¶

Linear regression in Python¶

Data transformation¶

Pandas and linear regression¶

Formula specification¶

Model summary¶

Multiple linear regression¶

Multiple linear regression continued¶

Markdown - a language of reports¶

Formatting text in Markdown¶

Lists in Markdown¶

Headers in Markdown¶

Images and links in Markdown¶

Tables in Markdown¶

Markdown tables in pandas¶

The end¶

Crosstabulation with `pivot_table`¶

`plotnine` - `ggplot` for Python¶

Structure of ggplot calls in `plotnine`¶

Creating a ggplot in `plotnine`¶

Compare to base `pandas`¶

Compare to `matplotlib`¶

Prettifying ggplot in `plotnine`¶

Other geometric objects (`geom_`)¶

Writing plots out in `plotnine`¶