# Assignment: Running Your First Program
import pandas
import numpy
# Load the data set from addhealth_pds.csv. low_memory=False makes pandas
# parse the file in one pass instead of in chunks (see the read_csv docs).
data = pandas.read_csv('addhealth_pds.csv', low_memory=False)

# Upper-case all column names straight after loading, so that every later
# data['NAME'] lookup matches whatever casing the CSV header uses.
data.columns = list(map(str.upper, data.columns))

# Print floats in fixed-point notation. This has to be set before the tables
# below are printed, otherwise it has no effect on them.
pandas.set_option('display.float_format', lambda x: '%f' % x)

print(len(data))          # number of observations (rows)
print(len(data.columns))  # number of variables (columns)
# setting variables you will be working with to numeric
# Convert the analysis variables to numeric.
# H1TO15 is included because it is the variable tabulated further down;
# H1TO14 is kept so anything that already relied on its conversion still works.
# errors='coerce' turns values that cannot be parsed into NaN instead of
# raising, so a single bad cell does not abort the whole script.
for _column in ('BIO_SEX', 'H1SU1', 'H1SU2', 'H1TO14', 'H1TO15'):
    data[_column] = pandas.to_numeric(data[_column], errors='coerce')
# counts and percentages (i.e. frequency distributions) for each variable
def _tabulate(column, as_share=False):
    """Return the frequency table of data[column] without sorting by count.

    With as_share=True the table holds proportions instead of raw counts.
    """
    return data[column].value_counts(sort=False, normalize=as_share)


# BIO_SEX: counts, then proportions
c1 = _tabulate('BIO_SEX')
print(c1)
p1 = _tabulate('BIO_SEX', as_share=True)
print(p1)

# H1SU1: counts, then proportions
c2 = _tabulate('H1SU1')
print(c2)
p2 = _tabulate('H1SU1', as_share=True)
print(p2)

# H1SU2: counts, then proportions
c3 = _tabulate('H1SU2')
print(c3)
p3 = _tabulate('H1SU2', as_share=True)
print(p3)

# H1TO15: counts only
c4 = _tabulate('H1TO15')
print(c4)
# Labelled frequency tables: each variable gets a count table followed by a
# percentage table, both in unsorted order.

print('counts for BIO_SEX')
c1 = data['BIO_SEX'].value_counts(sort=False)
print(c1)
print(len(data['BIO_SEX']))  # number of observations (rows)

print('percentages for BIO_SEX')
p1 = data['BIO_SEX'].value_counts(sort=False, normalize=True)
print(p1)

print('counts for H1SU1')
c2 = data['H1SU1'].value_counts(sort=False)
print(c2)

print('percentages for H1SU1')
p2 = data['H1SU1'].value_counts(sort=False, normalize=True)
print(p2)

# dropna=False keeps missing values (NaN) as their own row in the table.
print('counts for H1SU2')
c3 = data['H1SU2'].value_counts(sort=False, dropna=False)
print(c3)

print('percentages for H1SU2')
# dropna=False here as well, so the percentages are computed over the same
# rows as the counts printed just above.
p3 = data['H1SU2'].value_counts(sort=False, dropna=False, normalize=True)
print(p3)

print('counts for H1TO15')
c4 = data['H1TO15'].value_counts(sort=False, dropna=False)
print(c4)

print('percentages for H1TO15')
p4 = data['H1TO15'].value_counts(sort=False, dropna=False, normalize=True)
print(p4)
# ADDING MORE DESCRIPTIVE TITLES
# Same frequency tables as before, now printed under titles that spell out
# what each variable stands for.

# A stray mis-encoded quote character was removed from this title.
print('counts for BIO_SEX what is the gender')
c1 = data['BIO_SEX'].value_counts(sort=False)
print(c1)

print('percentages for BIO_SEX what is the gender')
p1 = data['BIO_SEX'].value_counts(sort=False, normalize=True)
print(p1)

print('counts for H1SU1 seriously thinking about suicide in the last 12 months')
c2 = data['H1SU1'].value_counts(sort=False)
print(c2)

print('percentages for H1SU1 seriously thinking about suicide in the last 12 months')
p2 = data['H1SU1'].value_counts(sort=False, normalize=True)
print(p2)

print('counts for H1SU2 attempting committing suicide in the last 12 months')
c3 = data['H1SU2'].value_counts(sort=False)
print(c3)

print('percentages for H1SU2 attempting committing suicide in the last 12 months')
p3 = data['H1SU2'].value_counts(sort=False, normalize=True)
print(p3)

# dropna=False keeps missing values (NaN) as their own row in the table.
print('counts for H1TO15 how many times a person thinks about alcohol during the past 12 months')
c4 = data['H1TO15'].value_counts(sort=False, dropna=False)
print(c4)

print('percentages for H1TO15 how many times a person thinks about alcohol during the past 12 months')
# dropna=False here as well, so the percentages are computed over the same
# rows as the counts printed just above.
p4 = data['H1TO15'].value_counts(sort=False, dropna=False, normalize=True)
print(p4)
# frequency distributions using the 'groupby' function
# Group the rows by BIO_SEX once and reuse the grouping for both tables.
sex_groups = data.groupby('BIO_SEX')
total_rows = len(data)

# Number of rows in each BIO_SEX group.
ct1 = sex_groups.size()
print(ct1)

# Each group's size as a percentage of all rows in the data frame.
pt1 = sex_groups.size() * 100 / total_rows
print(pt1)
# subset data to males (BIO_SEX == 1) who seriously thought about suicide
# in the last 12 months (H1SU1 == 1)
sub1 = data[(data['BIO_SEX'] == 1) & (data['H1SU1'] == 1)]

# make a copy of my new subsetted data, so that sub2 is an independent
# data frame rather than a selection taken from data
sub2 = sub1.copy()

# frequency distributions on new sub2 data frame
print('counts for BIO_SEX')
c5 = sub2['BIO_SEX'].value_counts(sort=False)
print(c5)

print('percentages for BIO_SEX')
p5 = sub2['BIO_SEX'].value_counts(sort=False, normalize=True)
print(p5)

print('counts for H1SU1')
# Tabulate H1SU1 here: it is the variable named in the title above and the
# one used for the matching percentages below.
c6 = sub2['H1SU1'].value_counts(sort=False)
print(c6)

print('percentages for H1SU1')
p6 = sub2['H1SU1'].value_counts(sort=False, normalize=True)
print(p6)
# upper-case all DataFrame column names - place after code for loading data above
# Replace the column labels with their upper-cased versions.
data.columns = [str.upper(label) for label in data.columns]
# bug fix for display formats to avoid run time errors - put after code for loading data above
def _fixed_point(value):
    """Format value with '%f', i.e. fixed-point with six decimal places."""
    return '%f' % value


# Have pandas render floats through the fixed-point formatter above.
pandas.set_option('display.float_format', _fixed_point)
# Results:
# In this part we see how many male, female, and undefined respondents there are, and what those numbers are when expressed as percentages.
# Refining the research question:
# The research question is: how many men seriously thought about committing suicide in the previous 12 months?