# -*- coding: utf-8 -*-
"""
Created on Wed May 25 08:52:38 2016
@author: DH KIM
"""
# ANOVA
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
data = pd.read_csv('nesarc_pds.csv', low_memory=False)
#setting variables you will be working with to numeric
data['S1Q10A'] = pd.to_numeric(data['S1Q10A'], errors='coerce')
data['S3AQ3B1'] = pd.to_numeric(data['S3AQ3B1'], errors='coerce')
data['S3AQ3C1'] = pd.to_numeric(data['S3AQ3C1'], errors='coerce')
data['CHECK321'] = pd.to_numeric(data['CHECK321'], errors='coerce')
data['SEX'] = pd.to_numeric(data['SEX'], errors='coerce')
#subset data to those people who have income and have smoked in the past 12 months
sub=data[ (data['S1Q10B']!=0) & (data['CHECK321']==1)]
sub2 = sub[['SEX', 'S3AQ3B1', 'S3AQ3C1', 'S1Q10A']].copy()
sub2['S3AQ3B1']=sub2['S3AQ3B1'].replace(9, np.nan)
sub2['S3AQ3C1']=sub2['S3AQ3C1'].replace(99, np.nan)
recode1 = {1: 30, 2: 22, 3: 14, 4: 5, 5: 2.5, 6: 1}
sub2['USFREQMO']= sub2['S3AQ3B1'].map(recode1)
# mean value of personal income in last 12 month
avgIncome = sub2['S1Q10A'].mean()
LvIncome = [0, sub2['S1Q10A'].quantile(0.3), sub2['S1Q10A'].quantile(0.7), sub2['S1Q10A'].max()]
# Split into 3 groups: 'low', 'medium', 'high'
splitIntoCat = pd.cut(sub2.S1Q10A, LvIncome, labels=['low', 'medium', 'high'])
sub2['INCOMECAT'] = splitIntoCat
# num of ciga per month
sub2['NUMCIGMO_EST']=sub2['USFREQMO'] * sub2['S3AQ3C1']
sub1 = sub2[['NUMCIGMO_EST', 'INCOMECAT', 'SEX']].dropna()
sub1.SEX = sub1.SEX - 1 # 0: Male, 1: Female
# using ols function for calculating the F-statistic and associated p value
model = smf.ols(formula='NUMCIGMO_EST ~ C(INCOMECAT)', data=sub1).fit()
print (model.summary())
print ('means for smoking amount by personal income')
print(sub1.groupby('INCOMECAT').mean())
print ('standard deviations for smoking amount by personal income')
print(sub1.groupby('INCOMECAT').std())
sns.factorplot(x="INCOMECAT", y="NUMCIGMO_EST", data=sub1, kind="bar", ci=None)
plt.xlabel('Income Level')
plt.ylabel('Smoking Amount')
# Moderator: SEX
print ('means for numcigmo_est by personal income according to SEX')
print(sub1.groupby('SEX').mean())
sub_m=sub1[ sub1.SEX == 0]
sub_f=sub1[ sub1.SEX == 1]
print ('Association between income level and smoking amount for Male')
model_m = smf.ols(formula='NUMCIGMO_EST ~ C(INCOMECAT)', data=sub_m).fit()
print (model_m.summary())
print ('Association between income level and smoking amount for Female')
model_f = smf.ols(formula='NUMCIGMO_EST ~ C(INCOMECAT)', data=sub_f).fit()
print (model_f.summary())
print ('Means for smoking amount by personal income for Male')
print (sub_m.groupby('INCOMECAT').mean())
print ('Means for smoking amount by personal income for Female')
print (sub_f.groupby('INCOMECAT').mean())
sns.factorplot(x="INCOMECAT", y="NUMCIGMO_EST", data=sub_m, kind="bar", ci=None)
plt.title('Smoking Amount according to income level for Male')
plt.xlabel('Income Level')
plt.ylabel('Smoking Amount')
sns.factorplot(x="INCOMECAT", y="NUMCIGMO_EST", data=sub_f, kind="bar", ci=None)
plt.title('Smoking Amount according to income level for Female')
plt.xlabel('Income Level')
plt.ylabel('Smoking Amount')
In my life
Interesting, intriguing
5/25/2016
5/17/2016
Correlation Coefficient - Pearson coefficient
It’s a sequel to the 1st assignment - study on the relationship b.t.w personal income and smoking amount. It’s already been verified through ANOVA test that there’re no significant relationship.
Plus, I was interested in association b.t.w personal & family income. To get better meaningful data, I should refine the income data. I got the mean & std value of the personal and family income. And then, discarded the data over 1 std from the mean value as np.nan.(refer to the below)
sub[sub.S1Q10A >= (sub.S1Q10A.mean()+sub.S1Q10A.std())] = np.nan sub[sub.S1Q11A >= (sub.S1Q11A.mean()+sub.S1Q11A.std())] = np.nan
The results are,
Association between personal income and smoking amount
(0.034107681918807302, 0.0025274573221202926)
(0.034107681918807302, 0.0025274573221202926)
Pearson coefficient is almost close to zero, however, the P-value is under significant level(alpha=0.05). The results made me confused because the plot looks there’re no relationship btw them while the P-value told me there’re certain related to one another. Probably, there’re missing factor beyond my knowledge, however, I can’t find the why for now. I tend to think that there is no ‘Linear’ relationship. But, obviously It seemed they have no relationship from that scatter plot.(and also I know the ANOVA results)
Association between personal income and family income
(0.66086621192976558, 0.0)
(0.66086621192976558, 0.0)
The 2nd results are crystal clear to say that there’re in very strong positive relationship and also significantly related to one another. The interesting part is P-value is ZERO (Is it possible?) which means there’s any chance to be in error when rejecting the null hypothesis.
And one more thing, everyone can earn more than at least family income(Is it typical in USA? or Must be wrong something;;) - graph is like a upper diagonal matrix.
Python code: Chi-square & Post hoc Test
“”“
S1Q10A: Total personal income in last 12M (Quantitative)
SINTOCAT: Total personal imcome in last 12M (Categorical, Low: 0~30%, Medium: 31~70%, High: 71%~max)
DMAJORDEPSNI12: Major depression in last 12M(Illness-induced ruled out) (0:N, 1:Y)
S1Q10A: Total personal income in last 12M (Quantitative)
SINTOCAT: Total personal imcome in last 12M (Categorical, Low: 0~30%, Medium: 31~70%, High: 71%~max)
DMAJORDEPSNI12: Major depression in last 12M(Illness-induced ruled out) (0:N, 1:Y)
@author: Daehwan, Kim
”“”
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
”“”
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
data = pd.read_csv(“nesarc_pds.csv”, low_memory=False)
LvIncome = [0, data.S1Q10A.quantile(0.3), data.S1Q10A.quantile(0.7), data.S1Q10A.max()]
splitIntoCat = pd.cut(data.S1Q10A, LvIncome, labels=[‘low’, 'medium’, 'high’])
splitIntoCat = pd.cut(data.S1Q10A, LvIncome, labels=[‘low’, 'medium’, 'high’])
sub = data.copy()
sub['SINTOCAT’] = splitIntoCat
sub['SINTOCAT’] = splitIntoCat
ct = pd.crosstab(sub.DMAJORDEPSNI12, sub.SINTOCAT, colnames=['Income Level’], rownames=['Depression’])
print(ct)
pct=ct/(ct.sum(axis=0))
print(pct)
print(pct)
sns.factorplot(x=“SINTOCAT”, y=“DMAJORDEPSNI12”, data=sub, ci=None, kind=“bar”)
plt.xlabel('Personal income in last 12M’)
plt.ylabel('Pct of people have depression’)
plt.xlabel('Personal income in last 12M’)
plt.ylabel('Pct of people have depression’)
chi2, p_val, df, exct =scipy.stats.chi2_contingency(ct)
print(’*** chi-squre value, p-value, dof’)
print('chi2: %0.2f\np-value: %s\ndof: %d\n\n’ % (chi2, p_val, df) )
print(’*** chi-squre value, p-value, dof’)
print('chi2: %0.2f\np-value: %s\ndof: %d\n\n’ % (chi2, p_val, df) )
comp_lvm = sub[['SINTOCAT’, 'DMAJORDEPSNI12’]].query('SINTOCAT == “low” or SINTOCAT == “medium”’)
ct_lvm = pd.crosstab(comp_lvm.DMAJORDEPSNI12, comp_lvm.SINTOCAT).drop('high’, 1)
print(ct_lvm)
print(ct_lvm)
pct_lvm=ct_lvm/(ct_lvm.sum(axis=0))
print(pct_lvm)
print(pct_lvm)
chi2, p_val, df, exct =scipy.stats.chi2_contingency(ct_lvm)
print(’*** chi-squre value, p-value, dof’)
print('chi2: %0.2f\np-value: %s\ndof: %d\n’ % (chi2, p_val, df) )
print(’*** chi-squre value, p-value, dof’)
print('chi2: %0.2f\np-value: %s\ndof: %d\n’ % (chi2, p_val, df) )
comp_lvm = sub[['SINTOCAT’, 'DMAJORDEPSNI12’]].query('SINTOCAT == “low” or SINTOCAT == “high”’)
ct_lvm = pd.crosstab(comp_lvm.DMAJORDEPSNI12, comp_lvm.SINTOCAT).drop('medium’, 1)
print(ct_lvm)
ct_lvm = pd.crosstab(comp_lvm.DMAJORDEPSNI12, comp_lvm.SINTOCAT).drop('medium’, 1)
print(ct_lvm)
pct_lvm=ct_lvm/(ct_lvm.sum(axis=0))
print(pct_lvm)
print(pct_lvm)
chi2, p_val, df, exct =scipy.stats.chi2_contingency(ct_lvm)
print(’*** chi-squre value, p-value, dof’)
print('chi2: %0.2f\np-value: %s\ndof: %d\n’ % (chi2, p_val, df) )
print(’*** chi-squre value, p-value, dof’)
print('chi2: %0.2f\np-value: %s\ndof: %d\n’ % (chi2, p_val, df) )
comp_lvm = sub[['SINTOCAT’, 'DMAJORDEPSNI12’]].query('SINTOCAT == “medium” or SINTOCAT == “high”’)
ct_lvm = pd.crosstab(comp_lvm.DMAJORDEPSNI12, comp_lvm.SINTOCAT).drop('low’, 1)
print(ct_lvm)
ct_lvm = pd.crosstab(comp_lvm.DMAJORDEPSNI12, comp_lvm.SINTOCAT).drop('low’, 1)
print(ct_lvm)
pct_lvm=ct_lvm/(ct_lvm.sum(axis=0))
print(pct_lvm)
print(pct_lvm)
chi2, p_val, df, exct =scipy.stats.chi2_contingency(ct_lvm)
print(’*** chi-squre value, p-value, dof’)
print('chi2: %0.2f\np-value: %s\ndof: %d\n’ % (chi2, p_val, df) )
print(’*** chi-squre value, p-value, dof’)
print('chi2: %0.2f\np-value: %s\ndof: %d\n’ % (chi2, p_val, df) )
Model Interpretation for Chi-Square and Post Hoc Tests:
I’d like to study about the relationship btw total personal income(explanatory) and depression(response) by using NESARC.
‘S1Q10A’: Total personal income in last 12M (Quantitative) ‘DMAJORDEPSNI12′: Major depression in last 12M (Illness-induced ruled out)
I refined Quantitative data(‘S1Q10A’ ) into 3-levels as ‘low’, ‘medium’, ‘high’ according to the amount of personal income. The ‘low’ level was set up to 30%, 'medium’ level was up to 70% and more than 70%(30% top of personal income) are categorized as 'high’.
Chi Square test of independence revealed that they were significantly associated, chi2: 195.14, p-value: 4.27e-43, dof: 2
Post hoc comparisons also told that any pairs among income categories are not associated.
Here’s the results.
========== chi-square test =================================
Income Level low medium high
Depression
0 9419 16032 12233
1 1051 1211 685
Income Level low medium high
Depression
0 0.899618 0.929769 0.946973
1 0.100382 0.070231 0.053027
*** chi-squre value, p-value, dof
chi2: 195.14
p-value: 4.23610993039e-43
dof: 2
Income Level low medium high
Depression
0 9419 16032 12233
1 1051 1211 685
Income Level low medium high
Depression
0 0.899618 0.929769 0.946973
1 0.100382 0.070231 0.053027
*** chi-squre value, p-value, dof
chi2: 195.14
p-value: 4.23610993039e-43
dof: 2
========== Post hoc test ===================================
low vs medium——————-
*** chi-squre value, p-value, dof
chi2: 78.60
p-value: 7.60330301415e-19
dof: 1
*** chi-squre value, p-value, dof
chi2: 78.60
p-value: 7.60330301415e-19
dof: 1
low vs high——————–
*** chi-squre value, p-value, dof
chi2: 188.03
p-value: 8.54030265154e-43
dof: 1
*** chi-squre value, p-value, dof
chi2: 188.03
p-value: 8.54030265154e-43
dof: 1
medium vs high——————–
*** chi-squre value, p-value, dof
chi2: 36.82
p-value: 1.2984866455e-09
dof: 1
*** chi-squre value, p-value, dof
chi2: 36.82
p-value: 1.2984866455e-09
dof: 1
피드 구독하기:
글 (Atom)
블로그 보관함
Categories
Labels
Advertising
Disqus Shortname
Popular Posts
-
유니티 튜토리얼 진행중에 둘의 차이점이 궁금해졌다. 어떨때는 gameObject로 사용하다가 어떨때는 GameObject로 사용하고.. 유니티 매뉴얼을 뒤져보면 GameObject는 Base Class로 Object Class를 상속 받고....
-
Get the hang of it, Get the knack of it! 인터넷 서핑중에 아는 단어들의 조합임에도 도무지 해석이 불가한 Idiom(?)을 발견했는데 직관적으로는 대채 무슨뜻인지 당췌 알 수가 없다. 또 잊어먹기전에 정리를 해두...
-
1. Make Mistake That's what I've done so far. 2. Scrap the Foreign Alphabet Make sense when it comes to English, however,...
-
“”“ S1Q10A: Total personal income in last 12M (Quantitative) SINTOCAT: Total personal imcome in last 12M (Categorical, Low: 0~30%, Medium: ...
-
It’s a sequel to the 1st assignment - study on the relationship b.t.w personal income and smoking amount. It’s already been verified thro...
-
이 블로그의 시작이 내인생에 있어서 또하나의 흐지부지한 결말중의 하나가 될 수도 있을거 같지만, 원래 성격상 호기심에 이것저것 자꾸 새로운 할 짓(?)을 찾아내서... 평소 관심사를 총망라해서 이것저것 작성해 볼 생각인데, 한두해 쌓이다 보면 내...
-
http://hotgott.tumblr.com/post/144729116411/python-code-test-a-basic-linear-regression
-
It's amazing and really intriguing me, in the near future, we could control another person via brainwaves. Of course, we'll need a b...
-
http://hotgott.tumblr.com/post/144886848546/study-on-relationship-between-personal-income