# ANOVA Test & Post hoc Test - Tukey HSD
# =============================================================
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
data = pd.read_csv(‘nesarc_pds.csv’, low_memory=False)
#setting variables you will be working with to numeric
data['S1Q10A’] = pd.to_numeric(data['S1Q10A’], errors='coerce’)
data['S3AQ3B1’] = pd.to_numeric(data['S3AQ3B1’], errors='coerce’)
data['S3AQ3C1’] = pd.to_numeric(data['S3AQ3C1’], errors='coerce’)
data['CHECK321’] = pd.to_numeric(data['CHECK321’], errors='coerce’)
data['S1Q10A’] = pd.to_numeric(data['S1Q10A’], errors='coerce’)
data['S3AQ3B1’] = pd.to_numeric(data['S3AQ3B1’], errors='coerce’)
data['S3AQ3C1’] = pd.to_numeric(data['S3AQ3C1’], errors='coerce’)
data['CHECK321’] = pd.to_numeric(data['CHECK321’], errors='coerce’)
#subset data to those people who have income and have smoked in the past 12 months
sub=data[ (data['S1Q10B’]!=0) & (data['CHECK321’]==1)]
sub=data[ (data['S1Q10B’]!=0) & (data['CHECK321’]==1)]
sub.loc[:,'S3AQ3B1’]=sub['S3AQ3B1’].replace(9, np.nan)
sub.loc[:,'S3AQ3C1’]=sub['S3AQ3C1’].replace(99, np.nan)
sub.loc[:,'S3AQ3C1’]=sub['S3AQ3C1’].replace(99, np.nan)
#recoding number of days smoked in the past month
recode1 = {1: 30, 2: 22, 3: 14, 4: 5, 5: 2.5, 6: 1}
sub['USFREQMO’]= sub.loc[:,'S3AQ3B1’].map(recode1)
recode1 = {1: 30, 2: 22, 3: 14, 4: 5, 5: 2.5, 6: 1}
sub['USFREQMO’]= sub.loc[:,'S3AQ3B1’].map(recode1)
# mean value of personal income in last 12 month
avgIncome = sub.loc[:,'S1Q10A’].mean(0)
LvIncome = [0, sub.loc[:,'S1Q10A’].quantile(0.3), sub.loc[:,'S1Q10A’].quantile(0.7), sub.loc[:,'S1Q10A’].max()]
avgIncome = sub.loc[:,'S1Q10A’].mean(0)
LvIncome = [0, sub.loc[:,'S1Q10A’].quantile(0.3), sub.loc[:,'S1Q10A’].quantile(0.7), sub.loc[:,'S1Q10A’].max()]
# Split into 3 groups: 'low’, 'medium’, 'high’
splitIntoCat = pd.cut(sub.S1Q10A, LvIncome, labels=['low’, 'medium’, 'high’])
sub['INCOMECAT’] = splitIntoCat
splitIntoCat = pd.cut(sub.S1Q10A, LvIncome, labels=['low’, 'medium’, 'high’])
sub['INCOMECAT’] = splitIntoCat
# num of ciga per month
sub['NUMCIGMO_EST’]=sub['USFREQMO’] * sub['S3AQ3C1’]
sub1 = sub[['NUMCIGMO_EST’, 'INCOMECAT’]].dropna()
sub['NUMCIGMO_EST’]=sub['USFREQMO’] * sub['S3AQ3C1’]
sub1 = sub[['NUMCIGMO_EST’, 'INCOMECAT’]].dropna()
# using ols function for calculating the F-statistic and associated p value
model = smf.ols(formula='NUMCIGMO_EST ~ C(INCOMECAT)’, data=sub1)
results = model.fit()
print (results.summary())
model = smf.ols(formula='NUMCIGMO_EST ~ C(INCOMECAT)’, data=sub1)
results = model.fit()
print (results.summary())
print ('means for numcigmo_est by personal income’)
print(sub1.groupby('INCOMECAT’).mean())
print(sub1.groupby('INCOMECAT’).mean())
print ('standard deviations for numcigmo_est by personal income’)
print(sub1.groupby('INCOMECAT’).std())
print(sub1.groupby('INCOMECAT’).std())
# ============ Post hoc test, Tukey HSD ===========================
mc1 = multi.MultiComparison(sub1['NUMCIGMO_EST’], sub1['INCOMECAT’])
res1 = mc1.tukeyhsd()
print(res1.summary())
res1 = mc1.tukeyhsd()
print(res1.summary())
# (blog footer) No comments:
# (blog footer) Post a comment