MachineLearning_kaggle heart disease

markdown ## MechinLearning_kaggle heart disease' 資料來源: [https://www.kaggle.com/datasets](https://www.kaggle.com/datasets) ``` import os import sys import numpy as np import pandas as pd import findspark findspark.init() import pyspark sc = pyspark.SparkContext() from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.classification import LogisticRegressionWithSGD from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.tree import DecisionTree from pyspark.mllib.tree import RandomForest from pyspark.mllib.classification import SVMWithSGD from pyspark.mllib.classification import NaiveBayes from pyspark.mllib.evaluation import BinaryClassificationMetrics from pyspark.mllib.feature import StandardScaler #匯入資料 heart = pd.read_csv('./heart.csv', header=0) spark = SparkSession.builder.getOrCreate() heart_df = spark.createDataFrame(heart) heart_df = heart_df.withColumn("target", heart_df["target"].cast(IntegerType())) heart_rdd = heart_df.rdd.map(list) #LabeledPoint=>分成trainData validationData testData heart_rdd_label = heart_rdd.map(lambda r: LabeledPoint(r[-1], r[:-1])) (trainData, validationData, testData) = heart_rdd_label.randomSplit([8, 1, 1]) print("trainData:" + str(trainData.count()) + "\nvalidationData:" + str(validationData.count()) + "\ntestData:" + str(testData.count()) ) #Model 訓練 LRG = LogisticRegressionWithSGD.train(trainData, iterations = 100) LRLBFGS = LogisticRegressionWithLBFGS.train(trainData, iterations = 100) DT = DecisionTree.trainClassifier( trainData, numClasses=2, categoricalFeaturesInfo={}, \ impurity="entropy", maxDepth=5, maxBins=5) RF = RandomForest.trainClassifier( trainData, numClasses=2, categoricalFeaturesInfo={}, \ impurity="entropy", maxDepth=5, maxBins=5, numTrees = 500) NB = NaiveBayes.train(trainData) SVMSGD = SVMWithSGD.train(trainData) models = [LRG, LRLBFGS, DT, RF, NB, SVMSGD] #畫出Predict的結果準確度 AUC from 
itertools import repeat AUCs = [evaluateModel_withoutTree(LRG, validationData), evaluateModel_withoutTree(LRLBFGS, validationData), evaluateModelTree(DT, validationData), evaluateModelTree(RF, validationData), evaluateModel_withoutTree(NB, validationData), evaluateModel_withoutTree(SVMSGD, validationData) ] models_name = ['LogisticRegressionWithSGD', 'LogisticRegressionWithLBFGS', 'DecisionTree', 'RandomForest', 'NaiveBayes', 'SVMWithSGD'] print("LogisticRegressionWithSGD : {:.3f}".format(evaluateModel_withoutTree(LRG, validationData))) print("LogisticRegressionWithLBFGS : {:.3f}".format(evaluateModel_withoutTree(LRLBFGS, validationData))) print("DecisionTree : {:.3f}".format(evaluateModelTree(DT, validationData))) print("RandomForest : {:.3f}".format(evaluateModelTree(RF, validationData))) print("NaiveBayes : {:.3f}".format(evaluateModel_withoutTree(NB, validationData)) print("SVMWithSGD : {:.3f}".format(evaluateModel_withoutTree(SVMSGD, validationData))) #SHOW AUC = pd.DataFrame({'model' : models_name, 'AUC' : AUCs}) AUC.set_index('model', inplace = True) AUC import matplotlib.pyplot as plt %matplotlib inline ax = AUC['AUC'].plot(kind='bar', title ='AUC of Models', legend=True, fontsize=12) ax.set_xlabel('Model',fontsize=12) ax.set_ylabel("AUC",fontsize=12) plt.show() ```

留言