markdown
## MachineLearning_kaggle heart disease
資料來源: [https://www.kaggle.com/datasets](https://www.kaggle.com/datasets)
```
import os
import sys
import numpy as np
import pandas as pd
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext()
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.feature import StandardScaler
# Load the data: read the CSV with pandas, then convert it to a Spark DataFrame
heart = pd.read_csv('./heart.csv', header=0)
spark = SparkSession.builder.getOrCreate()
heart_df = spark.createDataFrame(heart)
# Cast the "target" column to integer before building labeled points
heart_df = heart_df.withColumn(
    "target",
    heart_df["target"].cast(IntegerType()),
)
heart_rdd = heart_df.rdd.map(list)


def _to_labeled_point(row):
    """Build a LabeledPoint: the last value is the label, the rest are features."""
    return LabeledPoint(row[-1], row[:-1])


# LabeledPoint => split into trainData / validationData / testData (weights 8:1:1)
heart_rdd_label = heart_rdd.map(_to_labeled_point)
trainData, validationData, testData = heart_rdd_label.randomSplit([8, 1, 1])
# Report how many rows ended up in each split
print("".join([
    "trainData:" + str(trainData.count()),
    "\nvalidationData:" + str(validationData.count()),
    "\ntestData:" + str(testData.count()),
]))
# Model training: fit each classifier on the training split
LRG = LogisticRegressionWithSGD.train(trainData, iterations=100)
LRLBFGS = LogisticRegressionWithLBFGS.train(trainData, iterations=100)

# Hyper-parameters shared by the decision tree and the random forest
tree_settings = dict(
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity="entropy",
    maxDepth=5,
    maxBins=5,
)
DT = DecisionTree.trainClassifier(trainData, **tree_settings)
RF = RandomForest.trainClassifier(trainData, numTrees=500, **tree_settings)

NB = NaiveBayes.train(trainData)
SVMSGD = SVMWithSGD.train(trainData)

# Keep all trained models together, in training order
models = [
    LRG,
    LRLBFGS,
    DT,
    RF,
    NB,
    SVMSGD,
]
# Compute the prediction-quality score (AUC) of every model on the validation split.
# NOTE(review): evaluateModel_withoutTree and evaluateModelTree are not defined in
# the visible code; they must be defined before this section runs.
from itertools import repeat
# The tree-based models (DT, RF) go through the tree-specific helper; the other
# models use the generic one. Each model is evaluated exactly once here.
AUCs = [
    evaluateModel_withoutTree(LRG, validationData),
    evaluateModel_withoutTree(LRLBFGS, validationData),
    evaluateModelTree(DT, validationData),
    evaluateModelTree(RF, validationData),
    evaluateModel_withoutTree(NB, validationData),
    evaluateModel_withoutTree(SVMSGD, validationData),
]
# Display names, in the same order as AUCs
models_name = ['LogisticRegressionWithSGD', 'LogisticRegressionWithLBFGS',
               'DecisionTree', 'RandomForest',
               'NaiveBayes', 'SVMWithSGD']
# Print the scores computed above instead of re-running every evaluation a second
# time (the original also had an unbalanced parenthesis on the NaiveBayes line).
for model_name, auc in zip(models_name, AUCs):
    print("{} : {:.3f}".format(model_name, auc))
#SHOW
AUC = pd.DataFrame({'model' : models_name,
'AUC' : AUCs})
AUC.set_index('model', inplace = True)
AUC
import matplotlib.pyplot as plt
%matplotlib inline
ax = AUC['AUC'].plot(kind='bar', title ='AUC of Models',
legend=True, fontsize=12)
ax.set_xlabel('Model',fontsize=12)
ax.set_ylabel("AUC",fontsize=12)
plt.show()
```
留言
張貼留言