import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
# Seed both random number generators so the simulated answers are reproducible.
# The simulations in this notebook draw from NumPy's global generator
# (np.random.choice / np.random.binomial), which random.seed() does not affect,
# so NumPy has to be seeded explicitly as well.
random.seed(42)
np.random.seed(42)


# Load the raw A/B test records and preview the first rows.
df = pd.read_csv('ab_data.csv')
df.head()


# Report how many rows were loaded.
print(f'The number of rows in the data set is: {len(df)}')

The number of rows in the data set is: 294478


# Count the distinct user_id values in the raw data.
num_unique_users = df['user_id'].nunique()
print(f'The number of unique users in the dataset is: {num_unique_users}')

The number of unique users in the dataset is: 290584


# Share of distinct users with at least one converted row.
converted_user_ids = df.loc[df['converted'] == 1, 'user_id']
prop_converted = converted_user_ids.nunique() / num_unique_users
print(f'The proportion of unique users in the dataset who converted is: {round(prop_converted, 5)}')

The proportion of unique users in the dataset who converted is: 0.12104


# A row is a mismatch when exactly one of "in the treatment group" and
# "saw the new page" holds, i.e. the exclusive-or of the two flags.
in_treatment = df['group'] == 'treatment'
saw_new_page = df['landing_page'] == 'new_page'
num_mismatches = (in_treatment ^ saw_new_page).sum()
print(f"The number of times 'new_page' did not line up with 'treatment' is: {num_mismatches}")

The number of times 'new_page' did not line up with 'treatment' is: 3893


# Count rows that have a missing value in at least one column.
rows_with_gaps = df.isna().any(axis='columns')
num_missing_val = rows_with_gaps.sum()
print(f'The number of rows with missing values is: {num_missing_val}')

The number of rows with missing values is: 0


# Build the cleaned dataset: keep only rows where group and landing page agree
# (treatment with new_page, or non-treatment with a page other than new_page).
group_is_treatment = df['group'] == 'treatment'
page_is_new = df['landing_page'] == 'new_page'
df2 = df[group_is_treatment == page_is_new]
len(df2)

290585


# Sanity check: df2 must hold exactly the original rows minus the mismatches.
len(df2) == len(df) - num_mismatches

True


# Second sanity check: count rows in df2 where group and landing page still
# disagree - this should be 0.
in_treatment = df2['group'] == 'treatment'
saw_new_page = df2['landing_page'] == 'new_page'
len(df2[in_treatment != saw_new_page])

0


# Count the distinct user_id values in the cleaned data.
n_tot = df2['user_id'].nunique()
print(f'The number of unique users in df2 is: {n_tot}')

The number of unique users in df2 is: 290584


# Collect every row whose user_id occurs more than once in df2
# (keep=False flags all occurrences, not just the later ones).
repeat_user = df2[df2['user_id'].duplicated(keep=False)]
repeat_user


# Row labels of the duplicated records
repeat_user.index

Int64Index([1899, 2893], dtype='int64')


# Read the repeated user_id from the first duplicated row.
dup_id = repeat_user['user_id'].iloc[0]
print(f'The repeated user_id is: {dup_id}')

The repeated user_id is: 773192


repeat_user


# Keep only the first record for each user_id, discarding any later repeats
is_later_repeat = df2['user_id'].duplicated(keep='first')
df2 = df2.loc[~is_later_repeat]


# Confirm that a single row remains for the previously repeated user_id
df2.query('user_id == @dup_id')


# Check that df2 only contains unique user_ids: its row count must equal the
# number of distinct user_ids in df2 itself. (Comparing against the raw `df`
# would test a different property - that no user was lost during cleaning -
# and could fail or pass independently of whether df2 has duplicates.)
len(df2) == df2.user_id.nunique()

True


# Overall conversion rate across all rows of df2, regardless of page.
round(df2['converted'].mean(), 5)

0.1196


# Observed conversion rate among rows in the control group.
control_rows = df2['group'] == 'control'
obs_control_convert_rate = df2.loc[control_rows, 'converted'].mean()
round(obs_control_convert_rate, 5)

0.12039


# Observed conversion rate among rows in the treatment group.
treatment_rows = df2['group'] == 'treatment'
obs_treatment_convert_rate = df2.loc[treatment_rows, 'converted'].mean()
round(obs_treatment_convert_rate, 5)

0.11881


# Observed test statistic: treatment rate minus control rate
# (negative when the treatment group converts less often).
obs_diff = obs_treatment_convert_rate - obs_control_convert_rate
print(
    'The observed difference in conversion rates between\n'
    f' the treatment and the control groups is: {obs_diff}'
)

The observed difference in conversion rates between
 the treatment and the control groups is: -0.0015782389853555567


# Fraction of rows in df2 that were served the new landing page.
round(df2['landing_page'].eq('new_page').mean(), 5)

0.50006


# Under the null hypothesis both pages share one conversion rate, so the
# pooled proportion over all of df2 is used for the new and the old page.
p_pooled = df2['converted'].mean()
round(p_pooled, 5)

0.1196


# Re-display the pooled proportion: under the null the same value serves as
# the convert rate for both the new and the old page.
round(p_pooled, 5)

0.1196


# Number of rows that were served the new page.
n_new = df2['landing_page'].eq('new_page').sum()
print(f'The treatment sample size is: {n_new}')

The treatment sample size is: 145310


# Number of rows that were served the old page.
n_old = df2['landing_page'].eq('old_page').sum()
print(f'The control sample size is: {n_old}')

The control sample size is: 145274


# Check: the two page sample sizes together account for every unique user.
n_tot == n_new + n_old

True


# Simulate n_new conversion outcomes (0 or 1) under the null, where each
# outcome is 1 with probability p_pooled.
null_outcome_probs = [1 - p_pooled, p_pooled]   # P(outcome=0), P(outcome=1)
new_page_converted = np.random.choice([0, 1], size=n_new, p=null_outcome_probs)
new_page_converted.mean()

0.11933108526598307


# Simulate n_old conversion outcomes (0 or 1) under the null, where each
# outcome is 1 with probability p_pooled.
null_outcome_probs = [1 - p_pooled, p_pooled]   # P(outcome=0), P(outcome=1)
old_page_converted = np.random.choice([0, 1], size=n_old, p=null_outcome_probs)
old_page_converted.mean()

0.11803901592852127


# Difference in simulated conversion rates (new page minus old page)
# for this single simulated draw.
np.mean(new_page_converted) - np.mean(old_page_converted)

0.0012920693374618014


# The following process mimics the one used above but adds a loop of 10000 which takes a long time.
# It is faster to use the numpy built-in operation np.random.binomial(). See below.

# p_diffs = []

#for _ in range(10000):
#    new_page_converted = np.random.choice([0, 1], size=n_new, p=[1 - p_pooled, p_pooled])
#    old_page_converted = np.random.choice([0, 1], size=n_old, p=[1 - p_pooled, p_pooled])
#    p_diffs.append(new_page_converted.mean() - old_page_converted.mean())

##########
# Build the null sampling distribution of the difference in conversion rates.
# Each binomial draw is the number of conversions in one simulated sample;
# dividing by the sample size turns the count into a conversion rate.
n_simulations = 10000
new_page_converted_props = np.random.binomial(n_new, p_pooled, n_simulations) / n_new
old_page_converted_props = np.random.binomial(n_old, p_pooled, n_simulations) / n_old

# Element-wise difference: one simulated (new - old) rate gap per simulation.
p_diffs = new_page_converted_props - old_page_converted_props


# Histogram of the simulated differences under the null
plt.hist(p_diffs, bins='auto')

# Mark the observed difference with a red vertical line
plt.axvline(obs_diff, color='r', linewidth=2)

<matplotlib.lines.Line2D at 0x7f9f98099eb0>


# Simulated p-value: the share of null differences that exceed the observed one.
np.mean(p_diffs > obs_diff)

0.9042


import statsmodels.api as sm

# Tally conversions and sample sizes per landing page for the z-test.
on_old_page = df2['landing_page'] == 'old_page'
on_new_page = df2['landing_page'] == 'new_page'
convert_old = df2.loc[on_old_page, 'converted'].sum()
convert_new = df2.loc[on_new_page, 'converted'].sum()
n_old = on_old_page.sum()
n_new = on_new_page.sum()
convert_old, convert_new, n_old, n_new

(17489, 17264, 145274, 145310)


# Two-sample z-test for proportions. The new page is listed first, so
# alternative='larger' tests whether its conversion rate exceeds the old page's.
count = [convert_new, convert_old]
nobs = [n_new, n_old]
zstat, pvalue = sm.stats.proportions_ztest(
    count=count,
    nobs=nobs,
    value=0,
    alternative='larger',
)
print(f'The computed z-score is: {zstat}')
print(f'The computed p-value is: {pvalue}')

The computed z-score is: -1.3109241984234394
The computed p-value is: 0.9050583127590245


# Add 0/1 dummy variables for group. dtype=int keeps the indicators integer:
# pandas >= 2.0 makes get_dummies return bool columns by default, and a bool
# column next to the integer intercept yields an object-dtype design matrix
# that sm.Logit refuses to fit.
df3 = df2.join(pd.get_dummies(df2['group'], dtype=int))
df3['intercept'] = 1     # Constant column so the model fits an intercept


# Keep a single page indicator: drop the redundant 'control' dummy and
# expose the 'treatment' dummy under the name 'ab_page'.
df3 = df3.drop(columns=['control']).rename(columns={'treatment': 'ab_page'})
df3.head()


# Logistic regression of conversion on the page indicator plus an intercept.
page_predictors = ['intercept', 'ab_page']
mod1 = sm.Logit(endog=df3['converted'], exog=df3[page_predictors])
result1 = mod1.fit()

Optimization terminated successfully.
         Current function value: 0.366118
         Iterations 6


# Display the summary of the fitted page-only model.
result1.summary()


# Attach each user's country to the modelling frame by joining on user_id;
# the inner join keeps only user_ids present in both tables.
countries_df = pd.read_csv('countries.csv')
countries_by_user = countries_df.set_index('user_id')
model_rows_by_user = df3.set_index('user_id')
df_new = countries_by_user.join(model_rows_by_user, how='inner')
df_new.head()


# Create 0/1 dummy variables for country. dtype=int keeps them integer:
# pandas >= 2.0 makes get_dummies return bool columns by default, and bool
# columns next to the integer intercept/ab_page yield an object-dtype design
# matrix that sm.Logit refuses to fit.
df_new = df_new.join(pd.get_dummies(df_new['country'], dtype=int))
df_new.head()


# Logistic regression of conversion on country only. Just the UK and US
# dummies are included, so any remaining country acts as the baseline.
country_predictors = ['intercept', 'UK', 'US']
mod2 = sm.Logit(endog=df_new['converted'], exog=df_new[country_predictors])
result2 = mod2.fit()
result2.summary()

Optimization terminated successfully.
         Current function value: 0.366116
         Iterations 6


# Logistic regression of conversion on both the page indicator and country.
page_country_predictors = ['intercept', 'ab_page', 'UK', 'US']
mod3 = sm.Logit(endog=df_new['converted'], exog=df_new[page_country_predictors])
result3 = mod3.fit()
result3.summary()

Optimization terminated successfully.
         Current function value: 0.366113
         Iterations 6


# Exponentiate the fitted coefficients of result3; for a logit model these
# are the multiplicative effects on the odds of converting.
np.exp(result3.params)

intercept    0.131332
ab_page      0.985168
UK           1.051944
US           1.041599
dtype: float64


# Interaction terms: the product of the page indicator with each country
# dummy, stored as ab_UK and ab_US.
for country_code in ('UK', 'US'):
    df_new[f'ab_{country_code}'] = df_new['ab_page'] * df_new[country_code]
df_new.head()


# Logistic regression with page, country, and page-by-country interaction terms.
interaction_predictors = ['intercept', 'ab_page', 'UK', 'US', 'ab_UK', 'ab_US']
mod4 = sm.Logit(endog=df_new['converted'], exog=df_new[interaction_predictors])
result4 = mod4.fit()
result4.summary()

Optimization terminated successfully.
         Current function value: 0.366109
         Iterations 6

Dep. Variable:	converted	No. Observations:	290584
Model:	Logit	Df Residuals:	290582
Method:	MLE	Df Model:	1
Date:	Sat, 15 Apr 2023	Pseudo R-squ.:	8.077e-06
Time:	14:39:53	Log-Likelihood:	-1.0639e+05
converged:	True	LL-Null:	-1.0639e+05
Covariance Type:	nonrobust	LLR p-value:	0.1899

	coef	std err	z	P>|z|	[0.025	0.975]
intercept	-1.9888	0.008	-246.669	0.000	-2.005	-1.973
ab_page	-0.0150	0.011	-1.311	0.190	-0.037	0.007

Dep. Variable:	converted	No. Observations:	290584
Model:	Logit	Df Residuals:	290581
Method:	MLE	Df Model:	2
Date:	Sat, 15 Apr 2023	Pseudo R-squ.:	1.521e-05
Time:	14:39:54	Log-Likelihood:	-1.0639e+05
converged:	True	LL-Null:	-1.0639e+05
Covariance Type:	nonrobust	LLR p-value:	0.1984

	coef	std err	z	P>|z|	[0.025	0.975]
intercept	-2.0375	0.026	-78.364	0.000	-2.088	-1.987
UK	0.0507	0.028	1.786	0.074	-0.005	0.106
US	0.0408	0.027	1.518	0.129	-0.012	0.093

Dep. Variable:	converted	No. Observations:	290584
Model:	Logit	Df Residuals:	290580
Method:	MLE	Df Model:	3
Date:	Sat, 15 Apr 2023	Pseudo R-squ.:	2.323e-05
Time:	14:39:55	Log-Likelihood:	-1.0639e+05
converged:	True	LL-Null:	-1.0639e+05
Covariance Type:	nonrobust	LLR p-value:	0.1760

Analyze A/B Test Results

Brian Allan Woodcock

Course: "Practical Statistics" as part of Udacity's Data Analyst Nanodegree Program

Table of Contents

Introduction

Part I - Probability

Part II - A/B Test

Part III - A Regression Approach

Conclusions

	user_id	timestamp	group	landing_page	converted
0	851104	2017-01-21 22:11:48.556739	control	old_page	0
1	804228	2017-01-12 08:01:45.159739	control	old_page	0
2	661590	2017-01-11 16:55:06.154213	treatment	new_page	0
3	853541	2017-01-08 18:28:03.143765	treatment	new_page	0
4	864975	2017-01-21 01:52:26.210827	control	old_page	1

	user_id	timestamp	group	landing_page	converted
1899	773192	2017-01-09 05:37:58.781806	treatment	new_page	0
2893	773192	2017-01-14 02:55:59.590927	treatment	new_page	0

	country	timestamp	group	landing_page	converted	ab_page	intercept
user_id
834778	UK	2017-01-14 23:08:43.304998	control	old_page	0	0	1
928468	US	2017-01-23 14:44:16.387854	treatment	new_page	0	1	1
822059	UK	2017-01-16 14:04:14.719771	treatment	new_page	1	1	1
711597	UK	2017-01-22 03:14:24.763511	control	old_page	0	0	1
710616	UK	2017-01-16 13:14:44.000513	treatment	new_page	0	1	1

	coef	std err	z	P>|z|	[0.025	0.975]
intercept	-2.0040	0.036	-55.008	0.000	-2.075	-1.933
ab_page	-0.0674	0.052	-1.297	0.195	-0.169	0.034
UK	0.0118	0.040	0.296	0.767	-0.066	0.090
US	0.0175	0.038	0.465	0.642	-0.056	0.091
ab_UK	0.0783	0.057	1.378	0.168	-0.033	0.190
ab_US	0.0469	0.054	0.872	0.383	-0.059	0.152