# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

%matplotlib inline


# Read in dataset
df19 = pd.read_csv('usa_00006.csv')
df19.columns = df19.columns.str.lower()  # Change column names to lower case
df19.head()


df19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43143 entries, 0 to 43142
Data columns (total 41 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   year       43143 non-null  int64  
 1   sample     43143 non-null  int64  
 2   serial     43143 non-null  int64  
 3   cbserial   43143 non-null  int64  
 4   hhwt       43143 non-null  float64
 5   cluster    43143 non-null  int64  
 6   cpi99      43143 non-null  float64
 7   statefip   43143 non-null  int64  
 8   strata     43143 non-null  int64  
 9   gq         43143 non-null  int64  
 10  pernum     43143 non-null  int64  
 11  perwt      43143 non-null  float64
 12  sex        43143 non-null  int64  
 13  age        43143 non-null  int64  
 14  marst      43143 non-null  int64  
 15  race       43143 non-null  int64  
 16  raced      43143 non-null  int64  
 17  hispan     43143 non-null  int64  
 18  hispand    43143 non-null  int64  
 19  rachsing   43143 non-null  int64  
 20  educ       43143 non-null  int64  
 21  educd      43143 non-null  int64  
 22  empstat    43143 non-null  int64  
 23  empstatd   43143 non-null  int64  
 24  labforce   43143 non-null  int64  
 25  classwkr   43143 non-null  int64  
 26  classwkrd  43143 non-null  int64  
 27  occ1990    43143 non-null  int64  
 28  occ2010    43143 non-null  int64  
 29  inctot     43143 non-null  int64  
 30  ftotinc    43143 non-null  int64  
 31  incwage    43143 non-null  int64  
 32  incbus00   43143 non-null  int64  
 33  incss      43143 non-null  int64  
 34  incwelfr   43143 non-null  int64  
 35  incinvst   43143 non-null  int64  
 36  incretir   43143 non-null  int64  
 37  incsupp    43143 non-null  int64  
 38  incother   43143 non-null  int64  
 39  incearn    43143 non-null  int64  
 40  erscor90   43143 non-null  float64
dtypes: float64(4), int64(37)
memory usage: 13.5 MB


# How many unique values are there per variable?
for var in df19.columns:
    print(f"{var:<10}:" + f"  {df19[var].nunique()}")

year      :  1
sample    :  1
serial    :  24481
cbserial  :  24481
hhwt      :  692
cluster   :  24481
cpi99     :  1
statefip  :  1
strata    :  43
gq        :  5
pernum    :  14
perwt     :  817
sex       :  2
age       :  73
marst     :  6
race      :  9
raced     :  82
hispan    :  5
hispand   :  23
rachsing  :  5
educ      :  11
educd     :  24
empstat   :  3
empstatd  :  5
labforce  :  2
classwkr  :  3
classwkrd :  9
occ1990   :  312
occ2010   :  421
inctot    :  3570
ftotinc   :  4398
incwage   :  770
incbus00  :  445
incss     :  339
incwelfr  :  124
incinvst  :  464
incretir  :  570
incsupp   :  176
incother  :  378
incearn   :  1069
erscor90  :  108


df19.perwt.describe()

count    43143.000000
mean        97.375681
std        111.292538
min          1.000000
25%         26.000000
50%         59.000000
75%        131.000000
max       2408.000000
Name: perwt, dtype: float64


df19.perwt.value_counts().sort_index().head(25)

1.0       2
2.0      38
3.0      49
4.0      28
5.0      52
6.0      60
7.0      99
8.0     113
9.0     170
10.0    196
11.0    275
12.0    353
13.0    377
14.0    488
15.0    651
16.0    587
17.0    795
18.0    802
19.0    687
20.0    829
21.0    810
22.0    672
23.0    768
24.0    632
25.0    617
Name: perwt, dtype: int64


df19.perwt.value_counts().sort_index().tail(10)

1325.0    1
1382.0    1
1393.0    1
1414.0    1
1422.0    1
1519.0    1
1531.0    1
1852.0    1
1931.0    1
2408.0    1
Name: perwt, dtype: int64


modewt = df19.perwt.value_counts().max()
modewt

829


maxwt = df19.perwt.value_counts().sort_index().index[-1]


plt.figure(figsize=(14,6))

plt.hist(data=df19, x='perwt', bins='auto', color='mediumaquamarine')

plt.title(f'Distribution of 2019 ACS Person Weights for Persons {8805:c} 20 Years Old in Minnesota')

plt.xlabel('Weights')

plt.ylabel('Number of People')

ax = plt.gca()
ax.annotate(f"max = {maxwt:.0f}", 
            xy=(maxwt, 10), 
            xytext=(maxwt, 1000),
            arrowprops=dict(facecolor='black', shrink=0.05, width=2, headwidth=5),
            horizontalalignment='center');


xticks = [1, 3, 10, 30, 100, 300, 1000]
xlabels = [f"{v:n}" for v in xticks]
#print(xlabels)

plt.figure(figsize=(14,6))

sb.histplot(data=df19, x='perwt', bins='auto', log_scale=[True, False], element='step', color='mediumaquamarine')

plt.title(f'Distribution of 2019 Log-Scaled ACS Person Weights for Persons {8805:c} 20 Years Old in Minnesota" ')

plt.xlabel('Weights (Log Scale)')

plt.ylabel('Number of People')

plt.xticks(ticks=xticks, labels=xlabels)

ax = plt.gca()
ax.annotate(f"max = {maxwt:.0f}", 
            xy=(maxwt, 10), 
            xytext=(maxwt, 500),
            arrowprops=dict(facecolor='black', shrink=0.05, width=2, headwidth=5),
            horizontalalignment='center');


# Create Pandas series indexed by ages of the number of people per age using the weightings
ages_2019dist = df19[['age', 'perwt']].groupby(by='age').perwt.sum()
ages_2019dist

age
20    71400.0
21    72921.0
22    64342.0
23    72242.0
24    74287.0
       ...   
88    11601.0
89    11724.0
90     6918.0
91     7473.0
95    31285.0
Name: perwt, Length: 73, dtype: float64


# Check that the total sum with the new series object matches population expectations
totpop_2019 = ages_2019dist.sum()

f"The total population of adults 20 years and older according to the 2019 sample is: {totpop_2019:,.0f}"

'The total population of adults 20 years and older according to the 2019 sample is: 4,201,079'


# Create bar chart with population at each age using the person weighted sums

x = ages_2019dist.index
y = ages_2019dist.values

x_ticks = np.arange(20, 100, 5)

y_ticks = [y * 1e4 for y in range(9)]
y_labels = [f"{num/1e3:.0f}k" for num in y_ticks]

plt.figure(figsize=(14,6))

plt.bar(x=x, height=y)

plt.xticks(ticks=x_ticks)
plt.yticks(ticks=y_ticks, labels=y_labels)

ax = plt.gca()
ax.tick_params(labelright=True)   # add y-ticks on the right hand side

plt.grid(visible=True, axis='y')  # put in horizontal grid lines

ax.text(x=65, y=75000, 
        s=f"Total Population ({8805:c} 20 Years Old) = \n {totpop_2019:,.0f}",
        verticalalignment='bottom', horizontalalignment='left',
        color='green', fontsize=15)

ax.annotate('top-coded age', xy=(95, ages_2019dist.iloc[-1]), xytext=(85, 4.5e4),
            arrowprops=dict(facecolor='black', shrink=0.05, width=3, headwidth=7),
            horizontalalignment='left')

plt.title(f"2019 Distribution of Minnesota Population by Age ({8805:c} 20 Years Old)")
plt.xlabel('Age in Years')
plt.ylabel(f"Number of People");


# Create bins for age
step=10
bin_edges = np.arange(20,100+step, step)
bin_edges

array([ 20,  30,  40,  50,  60,  70,  80,  90, 100])


bin_starts = bin_edges[0:-1]
bin_starts

array([20, 30, 40, 50, 60, 70, 80, 90])


# Create a new variable `agecat` with age categories using pandas cut method

df19['agecat'] = pd.cut(x=df19.age, bins=bin_edges,
       labels=[f"{value}s" for value in bin_starts],
       ordered=True, right=False, include_lowest=True)


# Check on the newly created age categories associated with `agecat`
df19.agecat.unique()

['20s', '70s', '90s', '30s', '60s', '50s', '80s', '40s']
Categories (8, object): ['20s' < '30s' < '40s' < '50s' < '60s' < '70s' < '80s' < '90s']


# Create Pandas series indexed by age category of the number of people per category using the weightings

agecat_2019dist = df19[['agecat', 'perwt']].groupby(by='agecat').perwt.sum()
agecat_2019dist

agecat
20s    733893.0
30s    765931.0
40s    680552.0
50s    725165.0
60s    676149.0
70s    394862.0
80s    178851.0
90s     45676.0
Name: perwt, dtype: float64


# Check that the series total matches population expectations

tot2019 = agecat_2019dist.sum()

f"The total population of adults 20 years and older according to the 2019 sample is: {tot2019:,.0f}"

'The total population of adults 20 years and older according to the 2019 sample is: 4,201,079'


# Create bar chart with population in each age category

x = agecat_2019dist.index
y = agecat_2019dist.values

y_ticks = [y * 1e5 for y in range(9)]
y_labels = [f"{num/1e3:.0f}k" for num in y_ticks]

plt.figure(figsize=(14,6))

plt.bar(x=x, height=y)

plt.yticks(ticks=y_ticks, labels=y_labels) 

ax = plt.gca()
ax.tick_params(labelright=True)   # add y-ticks on the right hand side

plt.grid(visible=True, axis='y')   # put in horizontal grid lines

ax.text(x=4.7, y=680000, 
        s=f"Total Population ({8805:c} 20 Years Old) = \n {tot2019:,.0f}",
        verticalalignment='bottom', horizontalalignment='left',
        color='green', fontsize=15)

ax.annotate('90s and older because top-coding', 
            xy=(7, agecat_2019dist.iloc[-1]), 
            xytext=(5.8, agecat_2019dist.iloc[-1] + 3e5),
            arrowprops=dict(facecolor='black', shrink=0.05, width=3, headwidth=7),
            horizontalalignment='left')

plt.title(f"2019 Distribution of Minnesota Population by Age Categories ({8805:c} 20 Years Old)")
plt.xlabel('Age Categories')
plt.ylabel(f"Number of People");


sex_dict = {1: 'Male', 2: 'Female'}


# Create pandas series with person weight sums by sex

sex_2019dist = df19[['sex', 'perwt']].groupby(by='sex').perwt.sum()
sex_2019dist

sex
1    2074299.0
2    2126780.0
Name: perwt, dtype: float64


f"The sex ratio (i.e., the ratio of males to females) is: {sex_2019dist[1]/sex_2019dist[2]:.2f}"

'The sex ratio (i.e., the ratio of males to females) is: 0.98'


# Create pie chart showing population proportion by sex

plt.pie(x=sex_2019dist, labels=['Males', 'Females'], 
        colors=['cornflowerblue', 'violet'], 
        startangle=90, autopct='%1.1f%%')

plt.title(f"2019 Sex Distribution (of Persons {8805:c} 20 Years Old)\n in Minnesota");


# Create pandas series grouped by marital status and with sums using person weightings

marst_2019dist = df19[['marst', 'perwt']].groupby(by='marst').perwt.sum()
marst_2019dist

marst
1    2252007.0
2      72496.0
3      39673.0
4     494994.0
5     224733.0
6    1117176.0
Name: perwt, dtype: float64


# Check that the series sum matches with population expectations

marst_2019dist.sum()

4201079.0


# Create a marriage status dictionary to match the encodings with their interpretation

marst_dict = {
1 : 'Married,\n spouse present',
2 : 'Married,\n spouse absent',
3 : 'Separated',
4 : 'Divorced',
5 : 'Widowed',
6 : 'Never\n married/Single'}
marst_dict

{1: 'Married,\n spouse present',
 2: 'Married,\n spouse absent',
 3: 'Separated',
 4: 'Divorced',
 5: 'Widowed',
 6: 'Never\n married/Single'}


# Create horizontal bar chart of distribution of people by marriage status

y = marst_2019dist.index
x = marst_2019dist.values

tot = marst_2019dist.sum()

marst_list = list(marst_dict.values())  # for tick labels

plt.figure(figsize=(8,6))

plt.barh(y=y, width=x, tick_label=marst_list, color='mediumorchid')

plt.grid(visible=True, axis='x')

plt.title(f"2019 Distribution by Marital Status\n (of Persons {8805:c} 20 Years Old) in Minnesota");

locs, labels = plt.xticks()
x_labels = [f"{num/1e6:.1f}" for num in locs]
plt.xticks(ticks=locs, labels=x_labels)

plt.ylabel('Marital Status')
plt.xlabel(f"Number of People (in Millions)")

# Adding text to the bars
for i in y:
    x_loc = marst_2019dist[i]
    x_prop = x_loc/tot
    pct_string = f"{x_prop:0.1%}"   # Create the string value
    plt.text(x=x_loc + .01 * tot, y=i, s=pct_string, va='center');   # Print the string value next to the bar


# Create pandas series grouped by racial categories using the person weights

rachsing_2019dist = df19[['rachsing', 'perwt']].groupby(by='rachsing').perwt.sum()
rachsing_2019dist

rachsing
1    3525519.0
2     249554.0
3      35280.0
4     207922.0
5     182804.0
Name: perwt, dtype: float64


# Create a racial category dictionary to match encodings with their interpretation

rachsing_dict = {
1 : 'White',
2 : 'Black/African\n American',
3 : 'American Indian/\nAlaska Native',
4 : 'Asian/Pacific\n Islander',
5 : 'Hispanic/Latino'}
rachsing_dict

{1: 'White',
 2: 'Black/African\n American',
 3: 'American Indian/\nAlaska Native',
 4: 'Asian/Pacific\n Islander',
 5: 'Hispanic/Latino'}


# Create horizontal bar chart of distribution of people by single racial/ethnicity categories

y = rachsing_2019dist.index
x = rachsing_2019dist.values

tot = rachsing_2019dist.sum()

rachsing_list = list(rachsing_dict.values())   # for tick labels

plt.figure(figsize=(8,6))

plt.barh(y=y, width=x, tick_label=rachsing_list, color='darkgreen')

plt.grid(visible=True, axis='x')

plt.title(f"2019 Distribution by Race/Ethnicity\n (of Persons {8805:c} 20 Years Old) in Minnesota");

locs, labels = plt.xticks()
x_labels = [f"{num/1e6:.1f}" for num in locs]
plt.xticks(ticks=locs, labels=x_labels)

plt.ylabel('Race/Ethnicity')
plt.xlabel(f"Number of People (in Millions)")

# Adding text to the bars
for i in y:
    x_loc = rachsing_2019dist[i]
    x_prop = x_loc/tot
    pct_string = f"{x_prop:0.1%}"   # Create the string value
    plt.text(x=x_loc + .01 * tot, y=i, s=pct_string, va='center');   # Print the string value next to the bar


df19.educ.value_counts().sort_index()

0       401
1        42
2       459
3       227
4       418
5       482
6     15877
7      6518
8      5382
10     9042
11     4295
Name: educ, dtype: int64


# Create a mapping of the `educ` encodings to the new, collapsed education categories

edudict = {0: 'NA/None',
           1: 'Grades 0-8',
           2: 'Grades 0-8',
           3: 'Grades 9-11',
           4: 'Grades 9-11',
           5: 'Grades 9-11',
           6: 'Grade 12',
           7: '1yr College',
           8: '2yr College',
           10: '4yr College',
           11: '5+yrs College'}

edulist1 = list(edudict.values())

for n, value in enumerate(edulist1):
    print(n, value)

0 NA/None
1 Grades 0-8
2 Grades 0-8
3 Grades 9-11
4 Grades 9-11
5 Grades 9-11
6 Grade 12
7 1yr College
8 2yr College
9 4yr College
10 5+yrs College


edulist2 = [value for n, value in enumerate(edulist1) if value not in edulist1[:n]]
edulist2

['NA/None',
 'Grades 0-8',
 'Grades 9-11',
 'Grade 12',
 '1yr College',
 '2yr College',
 '4yr College',
 '5+yrs College']


# Create new ordered categorical type and new variable `educat`

edu_categories = edulist2

edu_type = pd.CategoricalDtype(categories=edu_categories, ordered=True)

df19['educat'] = df19['educ'].map(edudict).astype(edu_type)


# Create pandas series grouped by new educational categories using the person weights

educat_2019dist = df19[df19['year'] == 2019][['educat', 'perwt']].groupby(by='educat').perwt.sum()
educat_2019dist

educat
NA/None            50984.0
Grades 0-8         56645.0
Grades 9-11       106422.0
Grade 12         1375531.0
1yr College       627751.0
2yr College       474178.0
4yr College      1017663.0
5+yrs College     491905.0
Name: perwt, dtype: float64


# Create horizontal bar chart of distribution of people by educational attainment categories

y = educat_2019dist.index
x = educat_2019dist.values

tot = educat_2019dist.sum()

educat_list = list(y)   # for tick labels

plt.figure(figsize=(8,6))

plt.barh(y=y, width=x, tick_label=educat_list, color='firebrick')

plt.grid(visible=True, axis='x')

plt.title(f"2019 Distribution by Educational Attainment\n (of Persons {8805:c} 20 Years Old) in Minnesota");

locs, labels = plt.xticks()
x_labels = [f"{num/1e6:.1f}" for num in locs]
plt.xticks(ticks=locs, labels=x_labels)

plt.ylabel('Educational Attainment')
plt.xlabel(f"Number of People (in Millions)")

# Adding text to the bars
for i in y:
    x_loc = educat_2019dist[i]
    x_prop = x_loc/tot
    pct_string = f"{x_prop:0.1%}"   # Create the string value
    plt.text(x=x_loc + .01 * tot, y=i, s=pct_string, va='center');   # Print the string value next to the bar


# Create Labor Force Dictionary
labforce_dict = {1: 'No, not in the labor force', 2: 'Yes, in the labor force'}


# Check that `empstat` and `labforce` agree on who is not in the labor force using XOR
# Is there anyone who is one or the other but not both?

np.logical_xor(df19.empstat == 3, df19.labforce == 1).sum()

0


# Check that `empstat` and `labforce` agree on N/A using XOR
# Is there anyone who is one or the other but not both?

np.logical_xor(df19.empstat == 0, df19.labforce == 0).sum()

0


# Is it possible to be unemployed (`empstat`) and self-employed (`classwkr`)?

np.logical_and(df19.empstat == 2, df19.classwkr == 1).sum()

40


# Is it possible to be unemployed (`empstat`) and work for wages (`classwkr`)?

np.logical_and(df19.empstat == 2, df19.classwkr == 2).sum()

619


# Examine the combinations of `empstat` values with `classwkr` values

df_empclass = df19[['empstat', 'classwkr', 'perwt']].groupby(by=['empstat', 'classwkr']).perwt.sum().to_frame()
df_empclass


(df19.empstat == 0).sum()

0


# Feature engineering: Create a new variable `empclass` that combines `empstat` with `classwkr`

conditions = [
    (df19['empstat'] == 1) & (df19['classwkr'] == 0),
    (df19['empstat'] == 1) & (df19['classwkr'] == 1),
    (df19['empstat'] == 1) & (df19['classwkr'] == 2),
    (df19['empstat'] == 2) & (df19['classwkr'] == 0),
    (df19['empstat'] == 2) & (df19['classwkr'] == 1),
    (df19['empstat'] == 2) & (df19['classwkr'] == 2),
    (df19['empstat'] == 3) & (df19['classwkr'] == 0),
    (df19['empstat'] == 3) & (df19['classwkr'] == 1),
    (df19['empstat'] == 3) & (df19['classwkr'] == 2),
]

values = [
    'Employed & N/A',
    'Employed & Self-Employed',
    'Employed & Works for Wages',
    'Unemployed & N/A',
    'Unemployed & Self-Employed',
    'Unemployed & Works for Wages',
    'Not in Labor Force & N/A',
    'Not in Labor Force & Self-Employed',
    'Not in Labor Force & Works for Wages'
]

df19['empclass'] = np.select(conditions, values)

s_empclass = df19[['empclass', 'perwt']].groupby(by='empclass').perwt.sum()
s_empclass.sort_values(ascending=False)

empclass
Employed & Works for Wages              2614669.0
Not in Labor Force & N/A                 796254.0
Not in Labor Force & Works for Wages     389574.0
Employed & Self-Employed                 261296.0
Unemployed & Works for Wages              72575.0
Not in Labor Force & Self-Employed        55871.0
Unemployed & N/A                           6320.0
Unemployed & Self-Employed                 4520.0
Name: perwt, dtype: float64


# What is the unweighted break down of cases by 'empclass'

df19.empclass.value_counts()

Employed & Works for Wages              23957
Not in Labor Force & N/A                 9609
Not in Labor Force & Works for Wages     4604
Employed & Self-Employed                 3391
Not in Labor Force & Self-Employed        870
Unemployed & Works for Wages              619
Unemployed & N/A                           53
Unemployed & Self-Employed                 40
Name: empclass, dtype: int64


color_list = ['seagreen'] * 2 + ['chocolate'] * 6
color_list

['seagreen',
 'seagreen',
 'chocolate',
 'chocolate',
 'chocolate',
 'chocolate',
 'chocolate',
 'chocolate']


# Create horizontal bar chart of employment status and worker classification

y = s_empclass.index
x = s_empclass.values

tot = s_empclass.sum()

plt.figure(figsize=(10,6))

plt.barh(y=y, width=x, color=color_list)

plt.grid(visible=True, axis='x')

plt.title('2019 Distribution by Employment Status and Worker Classification\n' + 
          f"(of Persons {8805:c} 20 Years Old) in Minnesota");

locs, labels = plt.xticks()
x_labels = [f"{num/1e6:.1f}" for num in locs]
plt.xticks(ticks=locs, labels=x_labels)

plt.ylabel('Employment Status & Worker Classification')
plt.xlabel(f"Number of People (in Millions)")

# Adding text to the bars
for i in y:
    x_loc = s_empclass[i]
    x_prop = x_loc/tot
    pct_string = f"{x_prop:0.1%}"   # Create the string value
    plt.text(x=x_loc + .01 * tot, y=i, s=pct_string, va='center');   # Print the string value next to the bar


# Create a list of the income variables for easy recall
income_cols = [col for col in df19.columns if 'inc' in col]
income_cols

['inctot',
 'ftotinc',
 'incwage',
 'incbus00',
 'incss',
 'incwelfr',
 'incinvst',
 'incretir',
 'incsupp',
 'incother',
 'incearn']


# Examine some rows of income variables
df19[income_cols + ['empclass', 'age']].head(10)


# Get basic statistics for `incwage`
df19['incwage'].describe()

count     43143.000000
mean      34424.420601
std       53030.538372
min           0.000000
25%           0.000000
50%       19500.000000
75%       50000.000000
max      476000.000000
Name: incwage, dtype: float64


# Create masks as discussed above

mask_wageworker = df19['empclass'] == 'Employed & Works for Wages'
mask_zerowage = df19['incwage'] == 0


# Are there any employed, wage workers who reported $0 wage earnings?
# We would expect not!

df19.loc[mask_wageworker & mask_zerowage, income_cols + ['empclass', 'age']]


# Given the two masks, there are three cases of interest 

print(f"The number of (unweighted) cases of people with zero wage income is:",
      f"{  mask_zerowage.sum()  :,}")
print(f"The number of (unweighted) cases of wageworkers with non-zero wage income is:",
      f"{  (mask_wageworker & (~mask_zerowage)).sum() :,}")
print(f"The number of (unweighted) cases of non-wageworkers with non-zero wage income is:",
      f"{  ((~mask_wageworker) & (~mask_zerowage)).sum()  :,}")

The number of (unweighted) cases of people with zero wage income is: 15,450
The number of (unweighted) cases of wageworkers with non-zero wage income is: 23,955
The number of (unweighted) cases of non-wageworkers with non-zero wage income is: 3,738


# Create bins
scale = 10000    # Set the rough magnitude for the bins
mult = 1        # Fine-tuning adjustment
step = mult * scale       # Create the step size from the previous
upper = (df19['incwage'].max()//scale) * scale   # Create an upper rounded number in scale magnitude
bin_edges = np.arange(0, upper + 2 * step, step).tolist()    # Create the bins


plt.figure(figsize=(9,12))

########## Plot 1
ax1 = plt.subplot(2,1,1)

plt.hist(data=df19[mask_wageworker & (~mask_zerowage)], x='incwage', 
         weights='perwt', bins=bin_edges, color="slateblue")

plt.title('Distribution of Reported (Non-Zero) Wage Income\n for Employed Wage Workers')

xlocs = [x*1e5 for x in np.arange(6)]
xlabels = [f"${num/1e3:.0f}k" for num in xlocs]
plt.xticks(ticks=xlocs, labels=xlabels)

ylocs, ylabels = plt.yticks()
ylabels = [f"{num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

plt.ylabel('Number of People')


########## Plot 2
plt.subplot(2,1,2, sharey=ax1, sharex=ax1)

plt.hist(data=df19[(~mask_wageworker) & (~mask_zerowage)], x='incwage', 
         weights='perwt', bins=bin_edges, color='hotpink')

plt.title('Distribution of Reported (Non-Zero) Wage Income\n for People who did not Identify as an Employed Wage Worker')

plt.xlabel('Reported Wage Income')
plt.ylabel('Number of People');


# Check the claim that `incearn` is the sum of `incwage` and `incbus00`
(df19.incwage + df19.incbus00 != df19.incearn).sum()

0


mask_selfemployed = df19['empclass'] == 'Employed & Self-Employed'
mask_zerobusinc = df19['incbus00'] == 0
mask_employed = df19['empstat'] == 1


# Given these three additional masks, we can explore the employment terrain further

print(f"The number of (unweighted) cases of people who identified as employed is:",
      f"{  mask_employed.sum()  :,}")
print()
print("EMPLOYED analysis:")
print(f"The number of (unweighted) cases of people who identified as a wage-worker is:",
      f"{  mask_wageworker.sum()  :,}")
print(f"The number of (unweighted) cases of people who identified as self-employed is:",
      f"{  mask_selfemployed.sum()  :,}")
print()
print("WAGEWORKER analysis")
print(f"The number of (unweighted) cases of wageworkers with non-zero wage income is:",
      f"{  (mask_wageworker & (~mask_zerowage)).sum() :,}")
print(f"The number of (unweighted) cases of wageworkers with zero wage income is:",
      f"{  ((mask_wageworker) & (mask_zerowage)).sum()  :,}")
print(f"The number of (unweighted) cases of wageworkers with non-zero business income is:",
      f"{  (mask_wageworker & (~mask_zerobusinc)).sum() :,}")
print(f"The number of (unweighted) cases of wageworkers with zero business income is:",
      f"{  ((mask_wageworker) & (mask_zerobusinc)).sum()  :,}")

print()
print("SELF-EMPLOYED analysis")
print(f"The number of (unweighted) cases of self-employed with non-zero wage income is:",
      f"{  (mask_selfemployed & (~mask_zerowage)).sum() :,}")
print(f"The number of (unweighted) cases of self-employed with zero wage income is:",
      f"{  ((mask_selfemployed) & (mask_zerowage)).sum()  :,}")
print(f"The number of (unweighted) cases of self-employed with non-zero business income is:",
      f"{  (mask_selfemployed & (~mask_zerobusinc)).sum() :,}")
print(f"The number of (unweighted) cases of self-employed with zero business income is:",
      f"{  ((mask_selfemployed) & (mask_zerobusinc)).sum()  :,}")

The number of (unweighted) cases of people who identified as employed is: 27,348

EMPLOYED analysis:
The number of (unweighted) cases of people who identified as a wage-worker is: 23,957
The number of (unweighted) cases of people who identified as self-employed is: 3,391

WAGEWORKER analysis
The number of (unweighted) cases of wageworkers with non-zero wage income is: 23,955
The number of (unweighted) cases of wageworkers with zero wage income is: 2
The number of (unweighted) cases of wageworkers with non-zero business income is: 1,103
The number of (unweighted) cases of wageworkers with zero business income is: 22,854

SELF-EMPLOYED analysis
The number of (unweighted) cases of self-employed with non-zero wage income is: 1,642
The number of (unweighted) cases of self-employed with zero wage income is: 1,749
The number of (unweighted) cases of self-employed with non-zero business income is: 2,316
The number of (unweighted) cases of self-employed with zero business income is: 1,075


plt.figure(figsize=(12,12))

scale = 10000
mult = 1
step = mult * scale
upper = (df19['incwage'].max()//scale) * scale
bin_edges = np.arange(0, upper + 2 * step, step).tolist()


########### Plot 1
ax1 = plt.subplot(2,2,1)

plt.hist(data=df19[mask_wageworker & (~mask_zerowage)], 
         x='incwage', weights='perwt', bins=bin_edges,
         color='darkblue')

plt.title('Wage Worker - Wage Income')

plt.xlabel('Wage Income')
plt.ylabel('Number of People')

xlocs, xlabels = plt.xticks()  # Get locations and labels
xlocs = xlocs[1:-1]    # Remove first and last
xlabels = [f"${num/1e3:.0f}k" for num in xlocs]
plt.xticks(ticks=xlocs, labels=xlabels)

ylocs, ylabels = plt.yticks()   # Get locations and labels
ylabels = [f"{num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)


########## Plot 2
ax2 = plt.subplot(2,2,2, sharey=ax1, sharex=ax1)

plt.hist(data=df19[mask_wageworker & (~mask_zerobusinc)], 
         x='incbus00', weights='perwt', bins=bin_edges,
         color='cornflowerblue')

plt.title('Wage Worker - Business Income')

plt.xlabel('Business Income')



########### Plot 3
ax3 = plt.subplot(2,2,3, sharex=ax1)

plt.hist(data=df19[mask_selfemployed & (~mask_zerowage)], 
         x='incwage', weights='perwt', bins=bin_edges,
         color='darkgreen')

plt.ylim(0, ax1.get_ylim()[1] / 7)

ylocs, ylabels = plt.yticks()
ylocs = ylocs[:-1]    # Drop the last one
ylabels = [f"{num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

plt.title('Self-Employed - Wage Income')

plt.xlabel('Wage Income')
plt.ylabel('Number of People')

############ Plot 4
ax4 = plt.subplot(2,2,4, sharex=ax1, sharey=ax3)

plt.hist(data=df19[mask_selfemployed & (~mask_zerobusinc)], 
         x='incbus00', weights='perwt', bins=bin_edges,
         color='lightgreen')

plt.title('Self-Employed - Business Income')

plt.xlabel('Business Income');


maxincwage = df19.incwage.value_counts().sort_index().index.max()
maxincwage_wt = df19[df19.incwage == maxincwage].perwt.sum()
maxincwage, maxincwage_wt

(476000, 31372.0)


plt.figure(figsize=(12,6))

scale = 10000
mult = 1
step = mult * scale
upper = (df19['incwage'].max()//scale) * scale
bin_edges = np.arange(0, upper + 2 * step, step).tolist()


plt.hist(data=df19[mask_employed & (~mask_zerowage)],
         x='incwage', 
         weights='perwt', 
         bins=bin_edges,
         alpha=0.5, 
         color='magenta',
         label='All Employed')

plt.hist(data=df19[mask_wageworker & (~mask_zerowage)], 
         x='incwage', 
         weights='perwt', 
         bins=bin_edges, 
         alpha=0.5,
         color='blue',
         label='Employed & Works for Wages')

plt.title('Wage Income Distribution - All Employed versus Employed Wage Workers Alone')

plt.xlabel('Wage Income')
plt.ylabel('Number of People')

xlocs, xlabels = plt.xticks()  # Get locations and labels
xlocs = xlocs[1:-1]    # Remove first and last
xlabels = [f"${num/1e3:.0f}k" for num in xlocs]
plt.xticks(ticks=xlocs, labels=xlabels)

ylocs, ylabels = plt.yticks()
ylocs = ylocs[:-1]    # Drop the last one
ylabels = [f"{num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

plt.legend(loc='upper right')

ax = plt.gca()
ax.annotate(f"top-code = ${maxincwage/1e3:,.0f}k", 
            xy=(maxincwage, maxincwage_wt), 
            xytext=(maxincwage, maxincwage_wt * 3),
            arrowprops=dict(facecolor='black', shrink=0.05, width=2, headwidth=5),
            horizontalalignment='right');


df19.columns

Index(['year', 'sample', 'serial', 'cbserial', 'hhwt', 'cluster', 'cpi99',
       'statefip', 'strata', 'gq', 'pernum', 'perwt', 'sex', 'age', 'marst',
       'race', 'raced', 'hispan', 'hispand', 'rachsing', 'educ', 'educd',
       'empstat', 'empstatd', 'labforce', 'classwkr', 'classwkrd', 'occ1990',
       'occ2010', 'inctot', 'ftotinc', 'incwage', 'incbus00', 'incss',
       'incwelfr', 'incinvst', 'incretir', 'incsupp', 'incother', 'incearn',
       'erscor90', 'agecat', 'educat', 'empclass'],
      dtype='object')


new_cols = ['perwt', 'sex', 'age', 'marst', 'rachsing', 'empstat', 'incwage',
       'incbus00', 'incearn', 'erscor90', 'agecat', 'educat', 'empstat', 'labforce', 'empclass']


df19wages = df19[mask_employed & (~mask_zerowage)][new_cols]
df19wages.shape

(25597, 15)


df19wages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25597 entries, 0 to 43142
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   perwt     25597 non-null  float64 
 1   sex       25597 non-null  int64   
 2   age       25597 non-null  int64   
 3   marst     25597 non-null  int64   
 4   rachsing  25597 non-null  int64   
 5   empstat   25597 non-null  int64   
 6   incwage   25597 non-null  int64   
 7   incbus00  25597 non-null  int64   
 8   incearn   25597 non-null  int64   
 9   erscor90  25597 non-null  float64 
 10  agecat    25597 non-null  category
 11  educat    25597 non-null  category
 12  empstat   25597 non-null  int64   
 13  labforce  25597 non-null  int64   
 14  empclass  25597 non-null  object  
dtypes: category(2), float64(2), int64(10), object(1)
memory usage: 2.8+ MB


# Create Log-Scaled Histogram of Wage Income

xticks = [100, 300, 1000, 3000, 10000, 30000, 100000, 300000]
xlabels = [f"${v/1e3:n}k" for v in xticks]
#print(xlabels)

plt.figure(figsize=(14,6))

sb.histplot(data=df19wages, 
            x='incwage', 
            weights='perwt',
            bins=100, 
            log_scale=[True, False], 
            element='step', 
            color='mediumaquamarine')

plt.title('Distribution of 2019 Log-Scaled Wage Income of Employed Wage Earners in Minnesota')

plt.xlabel('Wage Income (Log Scale)')

plt.ylabel('Number of People')

plt.xticks(ticks=xticks, labels=xlabels)

ylocs, ylabels = plt.yticks()
ylocs = ylocs[:-1]    # Drop the last one
ylabels = [f"{num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

plt.xlim(1000,1e6);


# From Stackoverflow: https://stackoverflow.com/questions/58053594/how-to-create-a-boxplot-from-data-with-weights

def expand_df(df, weight_col):
    """expand the dataframe 
    so that rows are repeated as many times 
    as dictated by the values in weight_col"""
    df = df.reindex(df.index.repeat(df[weight_col]))
    df.reset_index(drop=True, inplace=True)
    return(df)


# Create expanded DataFrame as described above

df19wages_expanded = expand_df(df=df19wages, weight_col='perwt')
df19wages_expanded.shape

(2746113, 15)


# Check that the number of rows of the expanded DataFrame matches the sum of all the weights
df19wages.perwt.sum()

2746113.0


# Create Scatterplot of Wage Income by Ages-Numerical

import statistics
median_age = statistics.median(df19wages.age.unique().tolist())

plt.figure(figsize=(12,7))

plt.scatter(data=df19wages_expanded, x='age', y='incwage')

plt.title('Scatterplot of Wage Income by Age (in Years)')

plt.xlabel('Age')
plt.ylabel('Wage Income')

ylocs, ylabels = plt.yticks()
ylocs = ylocs[1:]    # Drop the last one
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

plt.text(x=median_age, y=maxincwage - 2e4, s='Top-Coded Values', ha='center');


# Create Heatmap of Wage Income by Age

plt.figure(figsize=(13,14))

###### Plot 1
plt.subplot(2,1,1)

# Specify bin edges 
bins_x = np.arange(18, 100, 1)
bins_y = np.arange(0, 5e5, 1e4)

plt.hist2d(data=df19wages_expanded, x='age', y='incwage', cmin=0.5, cmap='viridis_r', bins = [bins_x, bins_y])
plt.colorbar()
plt.xlabel('Age')
plt.ylabel('Wage Income')

plt.title('Heatmap of Wage Income by Age (in Years)')

ylocs, ylabels = plt.yticks()
ylocs = ylocs[1:]    # Drop the last one
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

plt.text(x=median_age, y=maxincwage - 2e4, s='Top-Coded Values', ha='center')



###### Plot 2
plt.subplot(2,1,2)

# Specify bin edges 
bins_x = np.arange(18, 100, 1)
bins_y = np.arange(0, 5e5, 2.5e3)

plt.hist2d(data=df19wages_expanded, x='age', y='incwage', cmin=0.5, cmap='viridis_r', bins = [bins_x, bins_y])
plt.colorbar()

# Title, Labels, and Ticks
plt.xlabel('Age')
plt.ylabel('Wage Income')

plt.title('Expanded View of Lower Wage Incomes')

ylocs, ylabels = plt.yticks()
ylocs = np.linspace(0,1.2e5, 13)
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

# Change y limits
plt.ylim(0, 1.2e5);


# Create Boxplots of Wage Income by Age Groups

plt.figure(figsize=(12,7))

sb.boxplot(data=df19wages_expanded, x='agecat', y='incwage')

# Title, Labels, and Ticks
plt.title('Boxplots of Wage Income by Age Groups')

plt.xlabel('Age Group')
plt.ylabel('Wage Income')

xlocs, xlabels = plt.xticks()

ylocs, ylabels = plt.yticks()
ylocs = ylocs[1:]    # Drop the last one
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

# Text
plt.text(x=(len(xlocs)/2 - 0.5), y=3.2e5, s='Top-Coded Values Not Shown', ha='center')

# Change y limits
plt.ylim(-1e4, 3.3e5);


# Investigate the sample size of wage earners in their 90s

samp90s = (df19wages['agecat'] == '90s').sum()
pop90s = df19wages[df19wages['agecat'] == '90s'].perwt.sum()

f"There are {samp90s} wage earners in the sample in their 90s with an estimated representation of {pop90s:.0f} people."

'There are 7 wage earners in the sample in their 90s with an estimated representation of 620 people.'


# Create Frequency-Scaled Violinplots of Wage Income by Age Groups

plt.figure(figsize=(12,7))

sb.violinplot(data=df19wages_expanded, x='agecat', y='incwage',
              scale='count',
              cut=0)

# Title, Labels, and Ticks
plt.title('Frequency-Scalded Violinplots of Wage Income by Age Groups')

plt.xlabel('Age Group')
plt.ylabel('Wage Income')

xlocs, xlabels = plt.xticks()

ylocs, ylabels = plt.yticks()
ylocs = ylocs[1:]    # Drop the last one
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

# Change y limits
ylim_top = 2.1e5
plt.ylim(-1e4,ylim_top)

# Add text
plt.text(x=(len(xlocs)/2 - 0.5), y=ylim_top * (1 - 0.033), 
         s='Top-Coded Values and Some Outliers Not Shown', ha='center');


labforce_dist = df19[['perwt', 'agecat', 'labforce']].groupby(by=['agecat', 'labforce']).perwt.sum()
labforce_dist

agecat  labforce
20s     1            94518.0
        2           639375.0
30s     1            91504.0
        2           674427.0
40s     1            69515.0
        2           611037.0
50s     1           113994.0
        2           611171.0
60s     1           321226.0
        2           354923.0
70s     1           335021.0
        2            59841.0
80s     1           170880.0
        2             7971.0
90s     1            45041.0
        2              635.0
Name: perwt, dtype: float64


labforce_dist_list = labforce_dist.to_list()
labforce_dist_list_exp = []
x_labels = []

for x, y in enumerate(labforce_dist_list):
    labforce_dist_list_exp.append(labforce_dist_list[x])
    if x%2 != 0:
        labforce_dist_list_exp.append(0)

for i in range(len(df19.agecat.cat.categories)):
    x_labels.append(df19.agecat.cat.categories[i])
    x_labels.append('')
    x_labels.append('')


df19.agecat.cat.categories

Index(['20s', '30s', '40s', '50s', '60s', '70s', '80s', '90s'], dtype='object')


plt.figure(figsize=(12,7))


# Create Bars
plt.bar(height=labforce_dist_list_exp, 
        x=np.arange(0, len(x_labels)), 
        tick_label=x_labels, 
        color=['gold', 'darkcyan', 'blue'] * (len(df19.agecat.cat.categories))
        )

# Title, Labels, and Ticks
plt.xticks(rotation=90)

# Remove x_ticks: https://stackoverflow.com/questions/12998430/remove-xticks-in-a-matplotlib-plot
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=True) # labels along the bottom edge are on

ylocs, ylabels = plt.yticks()
ylabels = [f"{num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

plt.ylabel('Number of People')

plt.title('Transfer of People Out of the Labor Force')

# Create Legend: 
# https://stackoverflow.com/questions/57340415/matplotlib-bar-plot-add-legend-from-categories-dataframe-column

colors = {'No, not in the labor force':'gold', 'Yes, in the labor force':'darkcyan'}         
labels = list(colors.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels]
plt.legend(handles, labels);


sex_2019dist = df19wages[['sex', 'perwt']].groupby(by='sex').perwt.sum()
sex_2019dist

sex
1    1419296.0
2    1326817.0
Name: perwt, dtype: float64


f"The sex ratio (i.e., the ratio of males to females) is: {sex_2019dist[1]/sex_2019dist[2]:.2f}"

'The sex ratio (i.e., the ratio of males to females) is: 1.07'


# Create pie chart showing proportion by sex of employed wage earners

plt.pie(x=sex_2019dist, labels=['Males', 'Females'], 
        colors=['cornflowerblue', 'violet'], 
        startangle=90, autopct='%1.1f%%')

plt.title(f"2019 Sex Distribution of Employed Wage Earners\n ({8805:c} 20 Years Old) in Minnesota");


# Create Count Frequency Histograms of Wage Income Distribution with Bins Clustered by Sex

scale = 10000
mult = 1
step = mult * scale
upper = (df19wages.incwage.max()//scale) * scale
bin_edges = np.arange(0, upper + 2 * step, step).tolist()

plt.figure(figsize=(12,7))

sb.histplot(data=df19wages, 
            x='incwage', 
            hue='sex', 
            hue_order=[1, 2], 
            bins=bin_edges, 
            palette='bright', 
            multiple='dodge', 
            weights='perwt')

plt.title('Wage Income Distribution by Sex')

plt.xlabel('Wage Income')
plt.ylabel('Number of People')

xlocs, xlabels = plt.xticks()  # Get locations and labels
xlocs = xlocs[1:-1]    # Remove first and last
xlabels = [f"${num/1e3:.0f}k" for num in xlocs]
plt.xticks(ticks=xlocs, labels=xlabels)

ylocs, ylabels = plt.yticks()
ylocs = ylocs[:-1]    # Drop the last one
ylabels = [f"{num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

ax = plt.gca()

# From: https://stackoverflow.com/questions/66346542/customizing-legend-in-seaborn-histplot-subplots

legend = ax.get_legend()
handles = legend.legendHandles
legend.remove()
ax.legend(handles, ['Male', 'Female'], title='Sex');


# Create Overlapping Count Frequency Histograms by Sex of Wage Income Distribution

scale = 10000
mult = 1
step = mult * scale
upper = (df19wages.incwage.max()//scale) * scale
bin_edges = np.arange(0, upper + 2 * step, step).tolist()


plt.figure(figsize=(12,7))

sb.histplot(data=df19wages, 
            x='incwage', 
            hue='sex', 
            hue_order=[1, 2], 
            bins=bin_edges, 
            stat='count',
            palette='bright', 
            multiple='layer', 
            weights='perwt',
            element='step', 
            alpha=0.4)

plt.title('Wage Income - Count Distribution by Sex')

plt.xlabel('Wage Income')
plt.ylabel('Number of People')

xlocs, xlabels = plt.xticks()  # Get locations and labels
xlocs = xlocs[1:-1]    # Remove first and last
xlabels = [f"${num/1e3:.0f}k" for num in xlocs]
plt.xticks(ticks=xlocs, labels=xlabels)

ylocs, ylabels = plt.yticks()
ylocs = ylocs[:-1]    # Drop the last one
ylabels = [f"{num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

ax = plt.gca()

# From: https://stackoverflow.com/questions/66346542/customizing-legend-in-seaborn-histplot-subplots

legend = ax.get_legend()
handles = legend.legendHandles
legend.remove()
ax.legend(handles, ['Male', 'Female'], title='Sex');


# Create Overlapping Relative Frequency Histograms by Sex of Wage Income Distribution

scale = 10000
mult = 1
step = mult * scale
upper = (df19wages.incwage.max()//scale) * scale
bin_edges = np.arange(0, upper + 2 * step, step).tolist()


plt.figure(figsize=(12,7))

sb.histplot(data=df19wages, 
            x='incwage', 
            hue='sex', 
            hue_order=[1, 2], 
            bins=bin_edges, 
            stat='proportion',
            common_norm=False,
            palette='bright', 
            multiple='layer', 
            weights='perwt',
            element='step', 
            alpha=0.4)

plt.title('Wage Income - Proportionate Distribution by Sex')

plt.xlabel('Wage Income')
plt.ylabel('Proportion')

xlocs, xlabels = plt.xticks()  # Get locations and labels
xlocs = xlocs[1:-1]    # Remove first and last
xlabels = [f"${num/1e3:.0f}k" for num in xlocs]
plt.xticks(ticks=xlocs, labels=xlabels)


ax = plt.gca()

# From: https://stackoverflow.com/questions/66346542/customizing-legend-in-seaborn-histplot-subplots

legend = ax.get_legend()
handles = legend.legendHandles
legend.remove()
ax.legend(handles, ['Male', 'Female'], title='Sex');


# Create Cumulative Distribution Curves by Sex of Wage Income

scale = 10000
mult = 1
step = mult * scale
upper = (df19wages.incwage.max()//scale) * scale
bin_edges = np.arange(0, upper + 2 * step, step).tolist()


plt.figure(figsize=(12,7))

sb.histplot(data=df19wages, 
            x='incwage', 
            hue='sex', 
            hue_order=[1, 2], 
            bins=bin_edges, 
            stat='proportion',
            cumulative=True,
            common_norm=False,
            palette='bright', 
            multiple='layer', 
            weights='perwt',
            element='step', 
            alpha=0.8,
            fill=False)

plt.title('Wage Income - Cumulative Distribution by Sex')

plt.xlabel('Wage Income')
plt.ylabel('Proportion')

xlocs, xlabels = plt.xticks()  # Get locations and labels
xlocs = xlocs[1:-1]    # Remove first and last
xlabels = [f"${num/1e3:.0f}k" for num in xlocs]
plt.xticks(ticks=xlocs, labels=xlabels)


ax = plt.gca()

# From: https://stackoverflow.com/questions/66346542/customizing-legend-in-seaborn-histplot-subplots

legend = ax.get_legend()
handles = legend.legendHandles
legend.remove()
ax.legend(handles, ['Male', 'Female'], title='Sex');


marst_dict.keys(), marst_dict.values()

(dict_keys([1, 2, 3, 4, 5, 6]),
 dict_values(['Married,\n spouse present', 'Married,\n spouse absent', 'Separated', 'Divorced', 'Widowed', 'Never\n married/Single']))


# Create boxplots of wage income by marital status

plt.figure(figsize=(12,7))

sb.boxplot(data=df19wages_expanded, x='marst', y='incwage', order=marst_dict.keys())


# Title, Labels, and Ticks
plt.title('Wage Income Distribution by Marital Status')

plt.xlabel('Marital Status')
plt.ylabel('Wage Income')

xlocs, xlabels = plt.xticks()  # Get locations and labels
#print(xlocs, xlabels)
plt.xticks(ticks=xlocs, labels=marst_dict.values())

ylocs, ylabels = plt.yticks()  # Get locations and labels
ylocs = ylocs[1:-1]    # Remove first and last
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

# Change y limits
ylim_top = 2.1e5
plt.ylim(-1e4,ylim_top)

# Add text
plt.text(x=(len(xlocs)/2 - 0.5), y=ylim_top * (1 - 0.033), 
         s='Top-Coded Values and Some Outliers Not Shown', ha='center');


# order marital status by medians of incwage
marst_medianorder = (df19wages_expanded[['marst', 'incwage']]
                    .groupby(by='marst')
                    .incwage.median()
                    .sort_values(ascending=False))

marst_medianorder

marst
1    55000.0
4    46000.0
3    35200.0
2    35000.0
6    34000.0
5    31000.0
Name: incwage, dtype: float64


print(f"The high-low difference in the median wage incomes is ${marst_medianorder[1] - marst_medianorder[5]:,.2f}.")

The high-low difference in the median wage incomes is $24,000.00.


# create list of labels from the order of medians
list_marst_medianorder = [marst_dict[x] for x in marst_medianorder.keys()]

list_marst_medianorder

['Married,\n spouse present',
 'Divorced',
 'Separated',
 'Married,\n spouse absent',
 'Never\n married/Single',
 'Widowed']


# Create violinplots of wage income by marital status

plt.figure(figsize=(12,7))

sb.violinplot(data=df19wages_expanded, 
              x='marst', 
              y='incwage', 
              order=marst_medianorder.keys(),
              scale='count',
              inner='box',
              cut=0)

# Title, Labels, and Ticks
plt.title('Wage Income Distribution by Marital Status (Median-Ordered and Scaled)')

plt.xlabel('Marital Status')
plt.ylabel('Wage Income')

xlocs, xlabels = plt.xticks()  # Get locations and labels
# print(xlocs, xlabels)
plt.xticks(ticks=xlocs, labels=list_marst_medianorder)

ylocs = np.arange(0, 150000, 20000)
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

# Add grid lines
plt.grid(visible=True, axis='y')  # put in horizontal grid lines

# Text
plt.text(x=(len(xlocs)/2 - 0.5), y=1.5e5, s='Upper Values Not Shown', ha='center')

# Change y limits
plt.ylim(-2e4,1.6e5);


rachsing_dict.keys(), rachsing_dict.values()

(dict_keys([1, 2, 3, 4, 5]),
 dict_values(['White', 'Black/African\n American', 'American Indian/\nAlaska Native', 'Asian/Pacific\n Islander', 'Hispanic/Latino']))


# order race by medians of incwage
race_medianorder = (df19wages_expanded[['rachsing', 'incwage']]
                    .groupby(by='rachsing')
                    .incwage.median()
                    .sort_values(ascending=False))

race_medianorder

rachsing
1    48000.0
4    40000.0
2    35000.0
5    33000.0
3    29200.0
Name: incwage, dtype: float64


# create list of labels from the order of medians
list_race_medianorder = [rachsing_dict[x] for x in race_medianorder.keys()]

list_race_medianorder

['White',
 'Asian/Pacific\n Islander',
 'Black/African\n American',
 'Hispanic/Latino',
 'American Indian/\nAlaska Native']


# Create boxplots of wage income by race/ethnicity

plt.figure(figsize=(12,7))

sb.boxplot(data=df19wages_expanded, x='rachsing', y='incwage', 
           order=race_medianorder.keys(),
           palette=['violet']*2 + ['greenyellow']*3)


# Title, Labels, and Ticks
plt.title('Wage Income Distribution by Race/Ethnicity ordered by Median Value')

plt.xlabel('Race/Ethnicity')
plt.ylabel('Wage Income')

xlocs, xlabels = plt.xticks()  # Get locations and labels
#print(xlocs, xlabels)
plt.xticks(ticks=xlocs, labels=list_race_medianorder)

ylocs, ylabels = plt.yticks()  # Get locations and labels
ylocs = ylocs[1:-1]    # Remove first and last
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

# Change y limits
ylim_top = 2.1e5
plt.ylim(-1e4,ylim_top)

# Add text
plt.text(x=(len(xlocs)/2 - 0.5), y=ylim_top * (1 - 0.033), 
         s='Top-Coded Values Not Shown', ha='center');


# Create boxplots of wage income by educational attainment categories

plt.figure(figsize=(12,7))

sb.boxplot(data=df19wages_expanded, x='educat', y='incwage',
           palette="Paired")

plt.title('Wage Income Distribution by Educational Attainment')

plt.xlabel('Educational Attainment')
plt.ylabel('Wage Income')


xlocs, xlabels = plt.xticks()  # Get locations and labels

ylocs, ylabels = plt.yticks()  # Get locations and labels
ylocs = ylocs[1:-1]    # Remove first and last
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

plt.text(x=(len(xlocs)/2 - 0.5), y=3e5, s='Top-Coded Values Not Shown', ha='center')

plt.ylim(-1e4,3.1e5);


racelist = [1, 4, 2, 3, 5]


educatlist = df19wages.educat.cat.categories.to_list()
educatlist

['NA/None',
 'Grades 0-8',
 'Grades 9-11',
 'Grade 12',
 '1yr College',
 '2yr College',
 '4yr College',
 '5+yrs College']


# Create table of proportions of educational attainment for each race/ethnicity
df_props = (df19wages_expanded[['rachsing', 'educat']]
 .groupby(by='rachsing')
 .educat
 .value_counts(normalize=True)).reset_index()

df_props.columns = ['rachsing', 'educat', 'prop']

df_props['perc'] = df_props['prop'] * 100
df_props


# Check that loop creates inner counts as expected for stacked bars
for level in educatlist:
    inner_counts = [df_props.loc[(df_props.rachsing == race) 
                                 & (df_props.educat == level)].perc.squeeze() 
                    for race in racelist]
    print(inner_counts)

[0.2331098740379488, 3.459642012833502, 3.2627983965277494, 1.3074842200180343, 3.118045013044055]
[0.17745038165251933, 1.4515366430260048, 1.6878349051938306, 1.3863841298467088, 8.449157663566554]
[0.947707356908905, 2.3519081391421817, 2.7019321956980864, 16.77750225428314, 9.496366825357052]
[26.578529603810015, 25.328605200945624, 37.22080324883492, 33.25067628494139, 33.51216708181644]
[15.311596358241214, 14.237757514353259, 17.859864903390214, 17.899008115419296, 14.019042846404409]
[13.258927078585042, 9.222559945964202, 13.320572269704497, 8.667718665464383, 8.643713060267956]
[29.996286433867724, 23.22323539344816, 14.438231991482281, 12.95649233543733, 16.224740961280528]
[13.496392912896635, 20.724755150287066, 9.507962089168418, 7.75473399458972, 6.536766548263004]


plt.figure(figsize=(6, 6))

# Initialize bars to zero
baselines = np.zeros(5)

# Create stacked bars
for i, level in enumerate(educatlist):
    inner_counts = [df_props.loc[(df_props.rachsing == race) 
                                 & (df_props.educat == level)].perc.squeeze() 
                    for race in racelist]
    plt.barh(y=np.arange(len(racelist)), 
            width=inner_counts,
            left=baselines,
            tick_label=racelist,
            zorder=2,
            color=sb.color_palette('Paired')[i])
    baselines += inner_counts
    
# Legend
plt.legend(labels=educatlist,
           loc=(-0.25, -0.27),
           title='Educational Attainment',
           title_fontsize='large',
           ncol=4
          )

# Titles, Labels, and Ticks
plt.title('Proportionate Distribution of\n Levels of Educational Attainment by Race/Ethnicity')
plt.xlabel('Percentage (%)')

locs, labels = plt.yticks()
y_labels = [rachsing_dict[num] for num in racelist]
plt.yticks(ticks=locs, labels=y_labels);


# What is the percentage attainment of a 4 year college degree and beyond
# for each of the racial/ethnic groups?
df_props[(df_props.educat == '4yr College') | 
         (df_props.educat == '5+yrs College')][['rachsing', 'perc']].groupby(by='rachsing').perc.sum()

rachsing
1    43.492679
2    23.946194
3    20.711226
4    43.947991
5    22.761508
Name: perc, dtype: float64


# Recall the `rachsing` dictionary values
rachsing_dict

{1: 'White',
 2: 'Black/African\n American',
 3: 'American Indian/\nAlaska Native',
 4: 'Asian/Pacific\n Islander',
 5: 'Hispanic/Latino'}


agecatlist = df19wages.agecat.cat.categories[:3].to_list()
agecatlist

['20s', '30s', '40s']


mask = df19wages_expanded['age'] < 50

df_props = (df19wages_expanded
 .loc[mask, ['agecat', 'educat']]
 .groupby(by='agecat', observed=True)
 .educat
 .value_counts(normalize=True).reset_index())

df_props.columns = ['agecat', 'educat', 'prop']

df_props['perc'] = df_props['prop'] * 100
df_props


df_props.loc[(df_props.agecat == '20s') & (df_props.educat == 'Grades 0-8'), 'perc'].squeeze()

0.3135214684618093


for level in educatlist:
    inner_counts = [df_props.loc[(df_props.agecat == agecat) 
                                 & (df_props.educat == level)].perc.squeeze() 
                    for agecat in agecatlist]
    print(inner_counts)

[0.569978358842757, 0.8493684567623646, 0.8277713887619838]
[0.3135214684618093, 1.1165021735185754, 1.175053781509663]
[1.442799434693595, 1.7717923836122225, 2.1187026622066085]
[28.590521606951867, 22.28223395107019, 23.918899758967815]
[22.96532242320893, 11.494481392945334, 12.954823531882939]
[10.890991638871322, 13.874070041734713, 13.95991198911942]
[29.185361434022838, 31.43767800367644, 28.20741720973301]
[6.041503634946881, 17.17387359668016, 16.837419677818563]


plt.figure(figsize=(6, 6))

baselines = np.zeros(3)

for i, level in enumerate(educatlist):
    inner_counts = [df_props.loc[(df_props.agecat == agecat) 
                                 & (df_props.educat == level)].perc.squeeze() 
                    for agecat in agecatlist]
    plt.barh(y=np.arange(len(agecatlist)), 
            width=inner_counts,
            left=baselines,
            tick_label=agecatlist,
            zorder=2,
            color=sb.color_palette('Paired')[i]
           )
    baselines += inner_counts
    
# Legend
plt.legend(labels=educatlist,
           loc=(-0.10,-0.27),
           title='Educational Attainment',
           title_fontsize='large',
           ncol=4
          )

title = '''Proportionate Distribution of 
Levels of Educational Attainment 
by Age Category (Under 50 Years Old)'''
plt.title(title)
plt.xlabel('Percentage (%)');


mask = df19wages_expanded['age'] < 50

fig = plt.figure(figsize=(10,12))

fig.suptitle('Wage Income by Educational Attainment\n and Age Category under 50', fontsize=20)


########### Plot 1
ax1 = plt.subplot(211)

# Barplot
ax = sb.barplot(data=df19wages_expanded[mask], 
                x='educat', y='incwage', hue='agecat',
                hue_order=df19wages_expanded.agecat.cat.categories.tolist()[:3],
                ci=None)

# Legend
ax.legend(loc='upper left', 
          bbox_to_anchor = (0., 1.15),
          ncol = 3, 
          framealpha = 1, 
          title = 'age category')

# Title, Labels, and Ticks
plt.title('Mean Wage Incomes')

plt.xlabel('Educational Attainment')
plt.ylabel('Wage Income')

ylocs, ylabels = plt.yticks()  # Get locations and labels
ylocs = ylocs[1:-1]    # Remove first and last
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

# Add grid lines
plt.grid(visible=True, axis='y')  # put in horizontal grid lines


########### Plot 2
plt.subplot(212, sharey=ax1)

# Barplot
ax = sb.barplot(data=df19wages_expanded[mask], 
                x='educat', y='incwage', hue='agecat',
                hue_order=df19wages_expanded.agecat.cat.categories.tolist()[:3],
                estimator=np.median, ci=None)

# Legend
ax.legend().remove()

# Title, Labels, and Ticks
plt.title('Median Wage Incomes')

plt.xlabel('Educational Attainment')
plt.ylabel('Wage Income')

ylocs, ylabels = plt.yticks()  # Get locations and labels
ylocs = ylocs[1:-1]    # Remove first and last
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

# Add grid lines
plt.grid(visible=True, axis='y')  # put in horizontal grid lines

# Add grid lines
plt.grid(visible=True, axis='y');  # put in horizontal grid lines


plt.figure(figsize=(12,7))

g = sb.FacetGrid(data=df19wages_expanded, 
                 col='rachsing', 
                 height=4, aspect=1.2, 
                 col_wrap=2,
                 col_order=racelist)
g.map(sb.boxplot, 'educat', 'incwage', width=0.5, order=edulist2)
g.set(ylim=(0, 2.5e5))


# Title,Labels, and Ticks
g.set_xlabels(label='Educational Attainment')
g.set_xticklabels(rotation=90)

ylocs, ylabels = plt.yticks()
ylabels = [f"{num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

g.set_ylabels('Wage Income ($)')

# Add suptitle
# https://stackoverflow.com/questions/29813694/how-to-add-a-title-to-seaborn-facet-plot
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('The Impact of Educational Attainment on Wage Incomes by Race/Ethnicity', 
                fontsize=16)

# To change facet titles:
# https://stackoverflow.com/questions/43920341/python-seaborn-facetgrid-change-titles
axes = g.axes.flatten()
for i, j in enumerate(racelist):
    axes[i].set_title(rachsing_dict[j])

<Figure size 864x504 with 0 Axes>


# Create new variable `Race` with interpreted values rather than numbers
df19wages_expanded = df19wages_expanded.assign(Race=df19wages_expanded
                                               .rachsing
                                               .map(rachsing_dict))


# Check creation of new variable
df19wages_expanded.columns

Index(['perwt', 'sex', 'age', 'marst', 'rachsing', 'empstat', 'incwage',
       'incbus00', 'incearn', 'erscor90', 'agecat', 'educat', 'empstat',
       'labforce', 'empclass', 'Race'],
      dtype='object')


plt.figure(figsize=(12,7))

sb.boxplot(data=df19wages_expanded,
           x='educat',
           y='incwage',
           hue='Race',
           hue_order=rachsing_dict.values()
          )


# Title, Labels, and Ticks
xlocs, xlabels = plt.xticks()
ylocs, ylabels = plt.yticks()
ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
plt.yticks(ticks=ylocs, labels=ylabels)

plt.ylabel('Wage Income')
plt.xlabel('Educational Attainment')

plt.title('The Distribution of Wage Income\n by Educational Attainment across Race/Ethnicity')


# Legend
plt.legend(loc='upper left', title='Race/Ethnicity')


# Change y limits
ylim_top = 3.1e5
plt.ylim(-1e4,ylim_top)

# Add text
plt.text(x=(len(xlocs)/2 - 0.5), y=ylim_top * (1 - 0.033), 
         s='Top-Coded Values Not Shown', ha='center');


# Create table of median wages by rachsing and educat
edurace_dist = (df19wages_expanded[['rachsing', 'educat', 'incwage']]
                     .groupby(by=['rachsing', 'educat'])
                     .incwage
                     .median().rename('median_wage').to_frame())

edurace_dist


# Create pivot table
edurace_dist.reset_index(inplace=True)
edurace_table = edurace_dist.pivot(index = 'rachsing', 
                                  columns = 'educat', 
                                  values = 'median_wage')
edurace_table


# Scale to the thousands
edurace_table/1000


# Create Median Wage Income Table
plt.figure(figsize=(8,5))

sb.heatmap(data=edurace_table/1000, 
           annot = True, 
           cmap='viridis_r', 
           linewidths=.5, 
           fmt = '.1f',
           square=True,
           yticklabels=list(rachsing_dict.values()),
           cbar_kws={'label': 'Thousands of Dollars ($1k)'})

plt.axvline(x=1, color='red')
plt.axvline(x=4, color='red')

plt.title('Median Wage Incomes as a Function of\n Educational Attainment and Race/Ethnicity')
plt.xlabel('Educational Attainment')
plt.ylabel('Race/Ethnicity');


# Create Mean Wage Income Table

edurace_dist = (df19wages_expanded[['rachsing', 'educat', 'incwage']]
                     .groupby(by=['rachsing', 'educat'])
                     .incwage
                     .mean().rename('mean_wage').to_frame())

edurace_dist.reset_index(inplace=True)
edurace_table = edurace_dist.pivot(index = 'rachsing', 
                                  columns = 'educat', 
                                  values = 'mean_wage')

plt.figure(figsize=(8,5))

sb.heatmap(data=edurace_table/1000, 
           annot = True, 
           cmap='viridis_r', 
           linewidths=.5, 
           fmt = '.1f',
           square=True,
           yticklabels=list(rachsing_dict.values()),
           cbar_kws={'label': 'Thousands of Dollars ($1k)'})

plt.axvline(x=1, color='red')
plt.axvline(x=4, color='red')

plt.title('Mean Wage Incomes as a Function of\n Educational Attainment and Race/Ethnicity')
plt.xlabel('Educational Attainment')
plt.ylabel('Race/Ethnicity');


# Create new variable `Sex` with interpreted values (Male/Female) rather than numbers
df19wages_expanded = df19wages_expanded.assign(Sex=df19wages_expanded
                                               .sex
                                               .map(sex_dict))


# Check creation of new variable
df19wages_expanded.columns

Index(['perwt', 'sex', 'age', 'marst', 'rachsing', 'empstat', 'incwage',
       'incbus00', 'incearn', 'erscor90', 'agecat', 'educat', 'empstat',
       'labforce', 'empclass', 'Race', 'Sex'],
      dtype='object')


# Create subplots looking at wage income by sex and other factors
x_var_dict = {'agecat': 'Age Category',
              'educat': 'Educational Attainment',
              'rachsing': 'Race/Ethnicity',
              'marst': 'Marital Status'}
x_order_list = [df19wages_expanded.agecat.cat.categories, 
                df19wages_expanded.educat.cat.categories, 
                rachsing_dict.keys(),
                marst_dict.keys()]
x_order_labels = [df19wages_expanded.agecat.cat.categories, 
                df19wages_expanded.educat.cat.categories, 
                rachsing_dict.values(),
                marst_dict.values()]


fig = plt.figure(figsize=(12,28))


for i, (x_var, x_order, x_labels) in enumerate(zip(x_var_dict.keys(), 
                                                  x_order_list, 
                                                  x_order_labels)):
    plt.subplot(4, 1, i+1)
    
    sb.boxplot(data=df19wages_expanded,
               x=x_var,
               y='incwage',
               order=x_order,
               hue='Sex',
               hue_order=['Male', 'Female'],
               palette='pastel'
              )

# Title, Labels, and Ticks

    xlocs, xlabels = plt.xticks()
    plt.xticks(ticks=xlocs, labels=x_labels)
    ylocs, ylabels = plt.yticks()
    ylabels = [f"${num/1e3:.0f}k" for num in ylocs]
    plt.yticks(ticks=ylocs, labels=ylabels)

    plt.ylabel('Wage Income')
    plt.xlabel(f"{x_var_dict[x_var]}")

    plt.title('The Distribution of Wage Income\n by ' + f"{x_var_dict[x_var]}" + ' and Sex')


# Legend
    plt.legend(loc='upper left', 
               bbox_to_anchor=(0, 1.12),
               ncol=2,
               title='Sex')

# Change y limits
    ylim_top = 3.1e5
    plt.ylim(-1e4,ylim_top)

# Add text
    plt.text(x=(len(xlocs)/2 - 0.5), y=ylim_top * (1 - 0.033), 
             s='Top-Coded Values Not Shown', ha='center')
    
# Adjust the layout
    fig.tight_layout()
    
plt.show();

	year	sample	serial	cbserial	hhwt	cluster	cpi99	statefip	strata	gq	...	incwage	incss	incinvst	incearn	erscor90
0	2019	201901	689720	2019010000012	11.0	2019006897201	0.652	27	110127	4	...	12000	0	0	12000	18.5
1	2019	201901	689721	2019010000014	23.0	2019006897211	0.652	27	110227	4	...	0	12000	0	0	999.9
2	2019	201901	689722	2019010000037	119.0	2019006897221	0.652	27	130227	3	...	0	8900	10	0	999.9
3	2019	201901	689723	2019010000095	85.0	2019006897231	0.652	27	100027	3	...	0	10300	0	0	999.9
4	2019	201901	689724	2019010000362	14.0	2019006897241	0.652	27	170027	4	...	0	9600	0	0	999.9

Encoding	Interpretation
1	Married, spouse present
2	Married, spouse absent
3	Separated
4	Divorced
5	Widowed
6	Never married/single

Encoding	Interpretation
1	White
2	Black/African American
3	American Indian/Alaska Native
4	Asian/Pacific Islander
5	Hispanic/Latino

Encoding	Interpretation
0	N/A
1	Self-employed
2	Works for wages

educat	NA/None	Grades 0-8	Grades 9-11	Grade 12	1yr College	2yr College	4yr College	5+yrs College
rachsing
1	20000.0	35000.0	32000.0	35000.0	38000.0	47000.0	60000.0	75000.0
2	35000.0	21100.0	23400.0	28800.0	33000.0	30000.0	55000.0	56000.0
3	66000.0	35000.0	18000.0	31500.0	13900.0	35000.0	45000.0	70000.0
4	25000.0	30000.0	34000.0	30700.0	23000.0	40000.0	60000.0	90000.0
5	33300.0	27100.0	27200.0	30000.0	24000.0	35000.0	48000.0	70000.0

Encoding	Interpretation
00	N/A or no schooling
01	Nursery school to grade 4
02	Grade 5, 6, 7, or 8
03	Grade 9
04	Grade 10
05	Grade 11
06	Grade 12
07	1 year of college
08	2 years of college
09	3 years of college
10	4 years of college
11	5+ years of college

		perwt
empstat	classwkr
1	1	261296.0
1	2	2614669.0
2	0	6320.0
	1	4520.0
	2	72575.0
3	0	796254.0
	1	55871.0
	2	389574.0

	inctot	ftotinc	incwage	incss	incinvst	incearn	empclass	age
0	12000	9999999	12000	0	0	12000	Employed & Works for Wages	21
1	12000	9999999	0	12000	0	0	Not in Labor Force & N/A	74
2	8910	9999999	0	8900	10	0	Not in Labor Force & N/A	79
3	10300	9999999	0	10300	0	0	Not in Labor Force & N/A	95
4	9600	9999999	0	9600	0	0	Not in Labor Force & N/A	79
5	55000	9999999	40000	15000	0	40000	Employed & Works for Wages	39
6	4600	9999999	4600	0	0	4600	Unemployed & Works for Wages	20
7	450	9999999	450	0	0	450	Not in Labor Force & Works for Wages	27
8	5400	9999999	0	5400	0	0	Not in Labor Force & N/A	67
9	17000	9999999	17000	0	0	17000	Not in Labor Force & Works for Wages	33

	inctot	ftotinc	incwage	incbus00	incss	incwelfr	incinvst	incretir	incsupp	incother	incearn	empclass	age
41078	17100	60500	0	0	17100	0	0	0	0	0	0	Employed & Works for Wages	65
42248	21100	90600	0	0	14400	0	0	0	6700	0	0	Employed & Works for Wages	70

	rachsing	educat	prop	perc
0	1	4yr College	0.299963	29.996286
1	1	Grade 12	0.265785	26.578530
2	1	1yr College	0.153116	15.311596
3	1	5+yrs College	0.134964	13.496393
4	1	2yr College	0.132589	13.258927
5	1	Grades 9-11	0.009477	0.947707
6	1	NA/None	0.002331	0.233110
7	1	Grades 0-8	0.001775	0.177450
8	2	Grade 12	0.372208	37.220803
9	2	1yr College	0.178599	17.859865
10	2	4yr College	0.144382	14.438232
11	2	2yr College	0.133206	13.320572
12	2	5+yrs College	0.095080	9.507962
13	2	NA/None	0.032628	3.262798
14	2	Grades 9-11	0.027019	2.701932
15	2	Grades 0-8	0.016878	1.687835
16	3	Grade 12	0.332507	33.250676
17	3	1yr College	0.178990	17.899008
18	3	Grades 9-11	0.167775	16.777502
19	3	4yr College	0.129565	12.956492
20	3	2yr College	0.086677	8.667719
21	3	5+yrs College	0.077547	7.754734
22	3	Grades 0-8	0.013864	1.386384
23	3	NA/None	0.013075	1.307484
24	4	Grade 12	0.253286	25.328605
25	4	4yr College	0.232232	23.223235
26	4	5+yrs College	0.207248	20.724755
27	4	1yr College	0.142378	14.237758
28	4	2yr College	0.092226	9.222560
29	4	NA/None	0.034596	3.459642
30	4	Grades 9-11	0.023519	2.351908
31	4	Grades 0-8	0.014515	1.451537
32	5	Grade 12	0.335122	33.512167
33	5	4yr College	0.162247	16.224741
34	5	1yr College	0.140190	14.019043
35	5	Grades 9-11	0.094964	9.496367
36	5	2yr College	0.086437	8.643713
37	5	Grades 0-8	0.084492	8.449158
38	5	5+yrs College	0.065368	6.536767
39	5	NA/None	0.031180	3.118045

	agecat	educat	prop	perc
0	20s	4yr College	0.291854	29.185361
1	20s	Grade 12	0.285905	28.590522
2	20s	1yr College	0.229653	22.965322
3	20s	2yr College	0.108910	10.890992
4	20s	5+yrs College	0.060415	6.041504
5	20s	Grades 9-11	0.014428	1.442799
6	20s	NA/None	0.005700	0.569978
7	20s	Grades 0-8	0.003135	0.313521
8	30s	4yr College	0.314377	31.437678
9	30s	Grade 12	0.222822	22.282234
10	30s	5+yrs College	0.171739	17.173874
11	30s	2yr College	0.138741	13.874070
12	30s	1yr College	0.114945	11.494481
13	30s	Grades 9-11	0.017718	1.771792
14	30s	Grades 0-8	0.011165	1.116502
15	30s	NA/None	0.008494	0.849368
16	40s	4yr College	0.282074	28.207417
17	40s	Grade 12	0.239189	23.918900
18	40s	5+yrs College	0.168374	16.837420
19	40s	2yr College	0.139599	13.959912
20	40s	1yr College	0.129548	12.954824
21	40s	Grades 9-11	0.021187	2.118703
22	40s	Grades 0-8	0.011751	1.175054
23	40s	NA/None	0.008278	0.827771

educat	NA/None	Grades 0-8	Grades 9-11	Grade 12	1yr College	2yr College	4yr College	5+yrs College
rachsing
1	20.0	35.0	32.0	35.0	38.0	47.0	60.0	75.0
2	35.0	21.1	23.4	28.8	33.0	30.0	55.0	56.0
3	66.0	35.0	18.0	31.5	13.9	35.0	45.0	70.0
4	25.0	30.0	34.0	30.7	23.0	40.0	60.0	90.0
5	33.3	27.1	27.2	30.0	24.0	35.0	48.0	70.0

Encoding	Interpretation
1	Male
2	Female

Part I - Exploration of 2019 Wage Income in Minnesota¶

by Brian Allan Woodcock¶

July 2022¶

Outline¶

Introduction¶

References¶

Preliminary Wrangling¶

What is the structure of your dataset?¶

What is/are the main feature(s) of interest in your dataset?¶

What features in the dataset do you think will help support your investigation into your feature(s) of interest?¶

Univariate Exploration¶

Person Weight -- perwt¶

Observations¶

Observations¶

Age Distribution in 2019¶

Observations¶

Age Categories -- agecat¶

Observations¶

Sex¶

Observations¶

Marital Status¶

Observations¶

Race/Ethnicity¶

Observations¶

Educational Attainment¶

Observations¶

Employment¶

Observations¶

Earned Income -- Wages and Business Income¶

Investigating Wage Income -- incwage¶

Observations¶

Investigating Earned Income (incearn) and Business Income (incbus00)¶

Observations¶

Observations¶

Question¶

Decision for Subsequent Analyses¶

Create DataFrame of Employed Wage Earners¶

Note¶

Observations¶

Bivariate Exploration¶

Create Expanded DataFrame in Place of Weights¶

Wage Income by Age¶

Observations¶

Observations¶

Observations¶

Observations¶

Transfer of People Out of the Labor Force¶

Observations¶

Wage Income by Sex¶

Observations¶

Observations¶

Wage Income by Marital Status¶

Observations¶

Observations¶

Wage Income by Race/Ethnicity¶

Observations¶

Wage Income by Educational Attainment¶

Observations¶

Distribution of Levels of Educational Attainment by Race/Ethnicity¶

Observations¶

Distribution of Levels of Educational Attainment by Age Category Under 50¶

Observations¶

Talk about some of the relationships you observed in this part of the investigation. How did the feature(s) of interest vary with other features in the dataset?¶

Did you observe any interesting relationships between the other features (not the main feature(s) of interest)?¶

Multivariate Exploration¶

Wage Income by Educational Attainment and Age Category under 50¶

Observations¶

The Impact of Education on Wage Income across Racial/Ethnic Categories¶

Observations¶

Observations¶

Observations on the Heatmap Tables¶

Sex Differences in Wage Income¶

Observations¶

Talk about some of the relationships you observed in this part of the investigation. Were there features that strengthened each other in terms of looking at your feature(s) of interest?¶

Were there any interesting or surprising interactions between features?¶

Conclusions¶

Outline ¶

Introduction ¶

Preliminary Wrangling ¶

Univariate Exploration ¶

Person Weight -- `perwt`¶

Age Distribution in 2019 ¶

Age Categories -- `agecat`¶

Sex ¶

Marital Status ¶

Race/Ethnicity ¶

Educational Attainment ¶

Employment ¶

Earned Income -- Wages and Business Income ¶

Investigating Wage Income -- `incwage`¶

Investigating Earned Income (`incearn`) and Business Income (`incbus00`)¶

Bivariate Exploration ¶

Wage Income by Age ¶

Wage Income by Sex ¶

Wage Income by Marital Status ¶

Wage Income by Race/Ethnicity ¶

Wage Income by Educational Attainment ¶

Distribution of Levels of Educational Attainment by Race/Ethnicity ¶

Distribution of Levels of Educational Attainment by Age Category Under 50 ¶

Multivariate Exploration ¶

Wage Income by Educational Attainment and Age Category under 50 ¶

The Impact of Education on Wage Income across Racial/Ethnic Categories ¶

Sex Differences in Wage Income ¶

Conclusions ¶