# Rebuild the row-level dataset that the two-way table above summarizes.
import pandas as pd

# One entry per cell of the two-way table: (Exposure label, ALL label, count),
# listed in the same order the rows appear in the DataFrame.
_cells = [
    ('Exposed', 'Yes', 6),
    ('Exposed', 'No', 16),
    ('Unexposed', 'Yes', 399),
    ('Unexposed', 'No', 2064),
]

# Row totals of the table.
n_exposed = sum(count for exposure, status, count in _cells if exposure == 'Exposed')  # 6 + 16
n_unexposed = sum(count for exposure, status, count in _cells if exposure == 'Unexposed')  # 399 + 2064

# Repeat each cell's labels `count` times so every tabulated observation gets its own row.
df = pd.DataFrame({
    'Exposure': [exposure for exposure, status, count in _cells for _row in range(count)],
    'ALL': [status for exposure, status, count in _cells for _row in range(count)],
})
# Rebuild the row-level dataset that the two-way table above summarizes.
import pandas as pd

# One entry per cell of the two-way table: (Exposure label, ALL label, count),
# listed in the same order the rows appear in the DataFrame.
_cells = [
    ('Exposed', 'Yes', 6),
    ('Exposed', 'No', 16),
    ('Unexposed', 'Yes', 399),
    ('Unexposed', 'No', 2064),
]

# Row totals of the table.
n_exposed = sum(count for exposure, status, count in _cells if exposure == 'Exposed')  # 6 + 16
n_unexposed = sum(count for exposure, status, count in _cells if exposure == 'Unexposed')  # 399 + 2064

# Repeat each cell's labels `count` times so every tabulated observation gets its own row.
df = pd.DataFrame({
    'Exposure': [exposure for exposure, status, count in _cells for _row in range(count)],
    'ALL': [status for exposure, status, count in _cells for _row in range(count)],
})
# 1. Write your null and alternative hypotheses.
# 2. Perform the hypothesis test using a randomization distribution.
# 3. Report the p-value.
# 4. What is the conclusion of your test? State this in terms of the hypotheses, and also in terms of the original question above. Make sure to indicate the significance level you are using.
# Rebuild the row-level dataset that the two-way table above summarizes.
import pandas as pd

# One entry per cell of the two-way table: (Exposure label, ALL label, count),
# listed in the same order the rows appear in the DataFrame.
_cells = [
    ('Exposed', 'Yes', 6),
    ('Exposed', 'No', 16),
    ('Unexposed', 'Yes', 399),
    ('Unexposed', 'No', 2064),
]

# Row totals of the table.
n_exposed = sum(count for exposure, status, count in _cells if exposure == 'Exposed')  # 6 + 16
n_unexposed = sum(count for exposure, status, count in _cells if exposure == 'Unexposed')  # 399 + 2064

# Repeat each cell's labels `count` times so every tabulated observation gets its own row.
df = pd.DataFrame({
    'Exposure': [exposure for exposure, status, count in _cells for _row in range(count)],
    'ALL': [status for exposure, status, count in _cells for _row in range(count)],
})
# 1. Write your null and alternative hypotheses.
# H_0: p = 0.000032 # p is the proportion of ALL among children in PA; the null hypothesis always uses "="
# H_a: p > 0.000032 # use ">" because the sample proportion (below) is larger than 0.000032
# 2. Perform the hypothesis test using a randomization distribution.
import numpy as np

n = 2019054  # number of trials in each simulated sample
p = 32/1000000  # proportion stated by the null hypothesis
N = 1000  # how many samples to simulate
dataphat = (399 + 6) / n  # observed sample proportion from the study

# Draw N counts from Binomial(n, p); each count is the numerator of one
# simulated sample proportion under the null hypothesis.
cnts = np.random.binomial(n=n, p=p, size=N)
phat = cnts / n  # the randomization distribution of sample proportions
phatdf = pd.DataFrame(data=phat, columns=['phat'])  # same values, as a one-column DataFrame

# 3. Report the p-value.
# The p-value is the share of simulated proportions at least as large as the observed one.
at_least_as_extreme = phatdf['phat'] >= dataphat
print("p-value :", int(at_least_as_extreme.sum()) / N)
# 4. What is the conclusion of your test? State this in terms of the hypotheses, and also in terms of the original question above. Make sure to indicate the significance level you are using.
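# Because the simulated p-value is 0 (none of the 1000 simulated proportions was at least as large as the observed sample proportion), which is less than 0.05, we reject the null hypothesis at the 0.05 significance level in favor of the alternative. In terms of the original question: there is evidence that the proportion of ALL among children in PA is greater than the national rate.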