from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
# Feature Importance
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
# Load the dataset from tree_addhealth.csv into a DataFrame.
# The path is a raw string so the Windows backslashes are taken literally:
# in a normal literal "\U" starts a Unicode escape (a SyntaxError here) and
# "\t" in "\tree_addhealth.csv" would silently turn into a TAB character.
data = pd.read_csv(
    r"C:\Users\guy3404\OneDrive - MDLZ\Documents\Cross Functional Learning\AI COP\Coursera\machine_learning_data_analysis\Datasets\tree_addhealth.csv"
)
Getting information about the dataset
We observe that some of the columns of the dataset contain null values. We need to drop the rows that contain them.
Drop null values from dataset
# Keep only complete rows: a row is removed as soon as any of its values
# is missing (these are the dropna defaults, spelled out for the reader).
data_clean = data.dropna(axis=0, how="any")
Length of dataset after dropping null values
Split into training and testing sets
# Explanatory variables handed to the classifiers.
predictors = data_clean[[
    'BIO_SEX', 'HISPANIC', 'WHITE', 'BLACK', 'NAMERICAN', 'ASIAN',
    'age', 'ALCEVR1', 'ALCPROBS1', 'marever1', 'cocever1', 'inhever1',
    'cigavail', 'DEP1', 'ESTEEM1', 'VIOL1', 'PASSIST', 'DEVIANT1',
    'SCHCONN1', 'GPA1', 'EXPEL1', 'FAMCONCT', 'PARACTV', 'PARPRES',
]]

# Response variable the models are trained to predict.
targets = data_clean['TREG1']

# Hold out 40% of the rows for testing; the remainder is used for training.
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors,
    targets,
    test_size=0.4,
)

# Shape look-ups kept as bare expressions: they only display a value when
# evaluated interactively (e.g. as the last line of a notebook cell).
pred_train.shape
pred_test.shape
tar_train.shape
tar_test.shape
Build model on training data
from sklearn.ensemble import RandomForestClassifier

# Train a 25-tree random forest on the training split.  fit() hands back
# the estimator itself, so construction and training are chained.
classifier = RandomForestClassifier(n_estimators=25).fit(pred_train, tar_train)
Predict using the random forest classifier on the test data
# Labels the fitted forest assigns to the held-out test rows.
predictions = classifier.predict(pred_test)
Print confusion matrix and accuracy score
# Report how the forest performed on the held-out rows.  The values are
# printed explicitly: a bare expression is discarded when this runs as a
# script, and a notebook cell would display only the last one.
print(sklearn.metrics.confusion_matrix(tar_test, predictions))
print(sklearn.metrics.accuracy_score(tar_test, predictions))
Fit an Extra Trees model to the data
# Train an extra-trees ensemble with default settings on the training
# split, then read off the importance score it assigns to each predictor.
model = ExtraTreesClassifier().fit(pred_train, tar_train)
feature_importances = model.feature_importances_
Create a Series with feature importances and corresponding feature names
# Label every importance score with the name of the column it belongs to.
feature_importance_series = pd.Series(
    data=feature_importances,
    index=pred_train.columns,
)
Sort features based on importance
# Order the scores from most to least important.
sorted_feature_importance = feature_importance_series.sort_values(
    ascending=False,
)
Plot the feature importances
# Horizontal bar chart of the sorted importance scores on a 10x6 figure.
plt.figure(figsize=(10, 6))
sorted_feature_importance.plot.barh()
plt.xlabel('Importance Score')
plt.title('Feature Importance')
plt.show()
# Run forests with different numbers of trees and see the effect
# of that on the accuracy of the prediction
# Fit one random forest per forest size (1 to 25 trees) and record the
# accuracy each achieves on the test split.
# `trees` holds the real estimator counts, so the x-axis of the plot below
# reads 1..25; a zero-based range would label every point one tree too low.
trees = range(1, 26)
accuracy = np.zeros(len(trees))
for idx, n_trees in enumerate(trees):
    classifier = RandomForestClassifier(n_estimators=n_trees)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)

# Clear the current axes, then draw test accuracy against number of trees.
plt.cla()
plt.plot(trees, accuracy)
Random forest analysis was performed to evaluate the importance of a series of variables in predicting whether a person is a regular smoker or not. We observed that, out of all features, marijuana use has the highest feature importance, followed by deviance and GPA. The random forest model could predict with an accuracy score of 85%.