## The data:
import pandas as pd
import numpy as np
## Range measurements split by group: the first 5 cases are labelled 'Low',
## the remaining 12 are labelled 'High'.
low_ranges = [240, 241, 236, 240, 240]
high_ranges = [
    230, 227, 232,
    255.4381603, 267.2467371, 225, 238,
    223, 225, 218, 227, 225,
]
## One row per case; the label column is built to line up with the values.
df = pd.DataFrame({
    'Mileage': ['Low'] * len(low_ranges) + ['High'] * len(high_ranges),
    'Range': low_ranges + high_ranges,
})
## STEP 1: Calculate the sample statistic
## STEP 2: Randomization distribution
## STEP 3: Calculate p-value. Direction of inequality agrees with Ha above!!!
## The data:
import pandas as pd
import numpy as np
## Observed data: one row per case, 5 labelled 'Low' followed by 12 labelled
## 'High', each with its Range value.
df = pd.DataFrame({
    'Mileage': ['Low'] * 5 + ['High'] * 12,
    'Range': [240, 241, 236, 240, 240, 230, 227, 232,
              255.4381603, 267.2467371, 225, 238,
              223, 225, 218, 227, 225],
})

## STEP 1: Calculate the sample statistic
## The statistic is the observed difference in group means, Low - High.
means = df.groupby('Mileage').mean()
sampdiff = means.loc['Low', 'Range'] - means.loc['High', 'Range']
print("Low Mileage Mean = ", means.loc['Low', 'Range'])
print("High Mileage Mean = ", means.loc['High', 'Range'])
print("Sample difference of means = ", sampdiff)

## STEP 2: Randomization distribution
## Shuffle the Range values across the fixed Mileage labels and recompute the
## same statistic each time. Work on a deep copy so df keeps the real data.
sim = df.copy(deep=True)
N = 1000  # number of simulated shuffles
n = sim.shape[0]  # num. rows
xbardiff = np.empty(N)
for i in range(N):
    # Drawing all n values without replacement is a random permutation.
    sim['Range'] = np.random.choice(sim['Range'], size=n, replace=False)
    # Use a separate name so the observed `means` from STEP 1 is not
    # overwritten by the shuffled group means.
    sim_means = sim.groupby('Mileage').mean()
    xbardiff[i] = sim_means.loc['Low', 'Range'] - sim_means.loc['High', 'Range']

## STEP 3: Calculate p-value. Direction of inequality agrees with Ha above!!!
## `>=` counts shuffles whose Low - High difference is at least as large as
## the observed one (upper tail). NOTE(review): Ha is not stated in this
## file -- confirm it is the "Low mean > High mean" alternative.
print("p-value =", np.count_nonzero(xbardiff >= sampdiff) / N)