# MachineLearning_kaggle heart disease

# MachineLearning_kaggle heart disease

# 資料來源 (data source): https://www.kaggle.com/datasets

import os
import sys
import numpy as np
import pandas as pd
import findspark

# findspark.init() is deliberately called before `import pyspark`; presumably
# it makes the local Spark installation importable — TODO confirm for this
# environment.
findspark.init()
import pyspark

# Create a SparkContext with no explicit configuration. `sc` itself is not
# referenced again in this section.
sc = pyspark.SparkContext()
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.feature import StandardScaler


# Load the data: read the CSV with pandas, hand it to Spark, force the
# "target" column to an integer type, and expose every row as a plain list.
spark = SparkSession.builder.getOrCreate()
heart = pd.read_csv('./heart.csv', header=0)

heart_df = spark.createDataFrame(heart)
heart_df = heart_df.withColumn(
    "target",
    heart_df["target"].cast(IntegerType()),
)

# Each Row becomes a list so it can be sliced into features / label later.
heart_rdd = heart_df.rdd.map(lambda row: list(row))


# Wrap every row in a LabeledPoint (last element = label, the remaining
# elements = features), then split into trainData / validationData / testData
# using the weights 8 : 1 : 1.
heart_rdd_label = heart_rdd.map(
    lambda row: LabeledPoint(row[-1], row[:-1])
)
trainData, validationData, testData = heart_rdd_label.randomSplit([8, 1, 1])

# Report how many records landed in each split.
print(
    "trainData:{}\nvalidationData:{}\ntestData:{}".format(
        trainData.count(),
        validationData.count(),
        testData.count(),
    )
)


# Model training: fit six classifiers on trainData.

# Linear models.
LRG = LogisticRegressionWithSGD.train(trainData, iterations=100)
LRLBFGS = LogisticRegressionWithLBFGS.train(trainData, iterations=100)

# Hyper-parameters shared by the two tree-based classifiers.
tree_settings = {
    "numClasses": 2,
    "categoricalFeaturesInfo": {},
    "impurity": "entropy",
    "maxDepth": 5,
    "maxBins": 5,
}
DT = DecisionTree.trainClassifier(trainData, **tree_settings)
RF = RandomForest.trainClassifier(trainData, numTrees=500, **tree_settings)

# Remaining classifiers, trained with their default settings.
NB = NaiveBayes.train(trainData)
SVMSGD = SVMWithSGD.train(trainData)

# All trained models, in the order used by the reporting code.
models = [LRG, LRLBFGS, DT, RF, NB, SVMSGD]


# Compute and print the validation AUC of every trained model.
# NOTE(review): evaluateModel_withoutTree and evaluateModelTree are not defined
# in this section; they must already be defined when this code runs.
from itertools import repeat  # not used in this section; kept as in the original

models_name = ['LogisticRegressionWithSGD', 'LogisticRegressionWithLBFGS',
               'DecisionTree', 'RandomForest',
               'NaiveBayes', 'SVMWithSGD']

# (evaluator, model) pairs in the same order as models_name. The two tree
# models go through evaluateModelTree, every other model through
# evaluateModel_withoutTree.
evaluations = [
    (evaluateModel_withoutTree, LRG),
    (evaluateModel_withoutTree, LRLBFGS),
    (evaluateModelTree, DT),
    (evaluateModelTree, RF),
    (evaluateModel_withoutTree, NB),
    (evaluateModel_withoutTree, SVMSGD),
]

# Evaluate each model exactly once; the same numbers feed both the printed
# report and the AUCs list used by the plotting code further down.
AUCs = [evaluate(model, validationData) for evaluate, model in evaluations]

# One "<model name> : <AUC with 3 decimals>" line per model. (The original
# NaiveBayes print statement was missing its closing parenthesis.)
for model_name, model_auc in zip(models_name, AUCs):
    print("{} : {:.3f}".format(model_name, model_auc))

# Show the results: tabulate the AUC per model and draw a bar chart.
AUC = pd.DataFrame({'model': models_name,
                    'AUC': AUCs})
AUC.set_index('model', inplace=True)
AUC  # bare expression: only has a visible effect as the last line of a notebook cell
import matplotlib.pyplot as plt

# `%matplotlib inline` is IPython-only syntax and a SyntaxError in a plain
# Python file. Issue the same magic through the IPython API when running under
# IPython/Jupyter, and skip it otherwise.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # get_ipython is only defined inside IPython; nothing to configure here.
    pass

ax = AUC['AUC'].plot(kind='bar', title='AUC of Models',
                     legend=True, fontsize=12)

ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel("AUC", fontsize=12)
plt.show()

# 留言