### 1 讀取UCI機器學習中的數據
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data <- read.csv(url, header=FALSE)
# 查看數據
head(data) # 該數據不含有列名,我們很難知道每列代表的意義
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
# 1 63 1 1 145 233 1 2 150 0 2.3 3 0.0 6.0 0
# 2 67 1 4 160 286 0 2 108 1 1.5 2 3.0 3.0 2
# 3 67 1 4 120 229 0 2 129 1 2.6 2 2.0 7.0 1
# 4 37 1 3 130 250 0 0 187 0 3.5 3 0.0 3.0 0
# 5 41 0 2 130 204 0 2 172 0 1.4 1 0.0 3.0 0
# 6 56 1 2 120 236 0 0 178 0 0.8 1 0.0 3.0 0
# 根據機器學習倉庫中的數據注釋添加列名,便於解讀數據的意義
colnames(data) <- c(
"age",
"sex",# 0 = female, 1 = male
"cp", # chest pain
# 1 = typical angina,
# 2 = atypical angina,
# 3 = non-anginal pain,
# 4 = asymptomatic
"trestbps", # resting blood pressure (in mm Hg)
"chol", # serum cholestoral in mg/dl
"fbs", # fasting blood sugar if less than 120 mg/dl, 1 = TRUE, 0 = FALSE
"restecg", # resting electrocardiographic results
# 1 = normal
# 2 = having ST-T wave abnormality
# 3 = showing probable or definite left ventricular hypertrophy
"thalach", # maximum heart rate achieved
"exang", # exercise induced angina, 1 = yes, 0 = no
"oldpeak", # ST depression induced by exercise relative to rest
"slope", # the slope of the peak exercise ST segment
# 1 = upsloping
# 2 = flat
# 3 = downsloping
"ca", # number of major vessels (0-3) colored by fluoroscopy
"thal", # this is short of thalium heart scan
# 3 = normal (no cold spots)
# 6 = fixed defect (cold spots during rest and exercise)
# 7 = reversible defect (when cold spots only appear during exercise)
"hd" # (the predicted attribute) - diagnosis of heart disease 心臟病的診斷屬性
# 0 if less than or equal to 50% diameter narrowing
# 1 if greater than 50% diameter narrowing
)
### 查看增加列名後的數據
head(data)
# age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal hd
# 1 63 1 1 145 233 1 2 150 0 2.3 3 0.0 6.0 0
# 2 67 1 4 160 286 0 2 108 1 1.5 2 3.0 3.0 2
# 3 67 1 4 120 229 0 2 129 1 2.6 2 2.0 7.0 1
### 查看數據的整體情況
# 'data.frame': 303 obs. of 14 variables:
# $ age : num 63 67 67 37 41 56 62 57 63 53 ... #年齡數值變量
# $ sex : num 1 1 1 1 0 1 0 0 1 1 ... #性別,數值型變量(應改為因子型變量)
# $ cp : num 1 4 4 3 2 2 4 4 4 4 ... #胸痛情況(應改為因子型變量)
# $ trestbps: num 145 160 120 130 130 120 140 120 130 140 ...
# $ chol : num 233 286 229 250 204 236 268 354 254 203 ...
# $ fbs : num 1 0 0 0 0 0 0 0 0 1 ... #空腹血糖是否滿足小於120 mg/dl(應改為因子型變量)
# .
# .
2. 數據預處理(確保數據能夠在程序中被正常識別)數據類型的整理:將分類變量設置為factor,連續型變量為num保持不變,刪除缺失值記錄。
### 2 數據預處理
# str(data) 查看數據的整體情況,判斷哪些變量需要進行類型轉換
## "?"為字符型數據,將"?"轉換為NA
data[data == "?"] <- NA
## 將分類變量設置為factor,連續型變量為num保持不變
# 這一步是確保數據能夠在程序中被正確識別
data[data$sex == 0,]$sex <- "F"
data[data$sex == 1,]$sex <- "M"
data$sex <- as.factor(data$sex)
data$cp <- as.factor(data$cp)
data$fbs <- as.factor(data$fbs)
data$restecg <- as.factor(data$restecg)
data$exang <- as.factor(data$exang)
data$slope <- as.factor(data$slope)
data$ca <- as.integer(data$ca) #R將"?"認為是字符型,故需要修正為整數型
data$ca <- as.factor(data$ca) #將整數型改為因子型
data$thal <- as.integer(data$thal) # "thal"中也有"?",處理同ca
data$thal <- as.factor(data$thal)
## 將hd(heart disease)中的數字(0~4)改成"Healthy" 和"Unhealthy"
data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy")
data$hd <- as.factor(data$hd) str(data) #再次核驗數據類型再次查看轉換後的數據類型:所有的分類變量被轉換成因子型(factor),所有的連續變量保持數值型(num)。
NA值的處理:如果NA較少,可以直接刪除;如果NA較多,應該使用隨機森林或其他方法替代NA。
## 查看NA個數
3. 數據質量控制
table(is.na(data))
# FALSE TRUE
# 4236 6
##展示含有NA的記錄
data[!complete.cases(data),]
# age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal hd
# 88 53 F 3 128 216 0 2 115 0 0.0 1 2 <NA> Healthy
# 167 52 M 3 138 223 0 0 169 0 0.0 1 <NA> 2 Healthy
# 193 43 M 4 132 247 1 2 143 1 0.1 2 <NA> 4 Unhealthy
# 267 52 M 4 128 204 1 0 156 1 1.0 2 2 <NA> Unhealthy
# 288 58 M 2 125 220 0 0 144 0 0.4 2 <NA> 4 Healthy
# 303 38 M 3 138 175 0 0 173 0 0.0 1 <NA> 2 Healthy
##去除含NA的樣本(6個樣本),僅保留完整數據的樣本(297個樣本)
data <- na.omit(data)
nrow(data)
# [1] 297### 3 數據質量控制
# 確保因子型數據中的每一個類別都有患者和非患者
# 刪除僅含有1-2個樣本的類別。因為其對odds或log(odds)有較大的效應量
xtabs(~ hd + sex, data=data)
xtabs(~ hd + cp, data=data)
xtabs(~ hd + fbs, data=data)
xtabs(~ hd + exang, data=data)
xtabs(~ hd + slope, data=data)
xtabs(~ hd + ca, data=data)
xtabs(~ hd + thal, data=data)
xtabs(~ hd + restecg, data=data) ##restecg事件的類別1僅含有4個樣本。暫時保留,觀察其對結局判定的影響
# restecg
#hd 0 1 2
# Healthy 92 1 67
# Unhealthy 55 3 79
4. 簡單logistic回歸(基於單一變量性別預測是否患有心臟病)# 4 簡單logistic回歸:基於單一變量(性別)預測患者是否患有心臟病(hd)
## 4.1 查看原始數據
xtabs(~ hd + sex, data=data)
# sex
# hd F M
# Healthy 71 89
# Unhealthy 25 112
## 大多數男性是患者,而大多數女性不是患者
## 男性是患者的優勢比更大(112/89);女性是患者的優勢比較小(25/71)
## 換句話說,男性增加心臟病發生的優勢,男性更容易發生心臟病glm()執行logistic回歸:
## 4.2 進行logistic回歸
## glm()是執行廣義線性模型(generalized linear models)的函數,
## 「hd~sex」表示基於性別預測hd
## 指定binomial family產生廣義線性模型,使glm()執行logistic回歸而不是其他類型的廣義線性模型
## glm()函數一步搞定logistic回歸
logistic <- glm(hd ~ sex, data=data, family="binomial")summary(logistic)查看logistic回歸結果:
##查看logistic回歸結果
summary(logistic)
# Call:
# glm(formula = hd ~ sex, family = "binomial", data = data)
#
# Deviance Residuals:
# Min 1Q Median 3Q Max
# -1.2765 -1.2765 -0.7768 1.0815 1.6404
#
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -1.0438 0.2326 -4.488 7.18e-06 ***
# sexM 1.2737 0.2725 4.674 2.95e-06 ***
# ---
# Signif. codes: 0 『***』 0.001 『**』 0.01 『*』 0.05 『.』 0.1 『 』 1
#
# (Dispersion parameter for binomial family taken to be 1)
#
# Null deviance: 409.95 on 296 degrees of freedom
# Residual deviance: 386.12 on 295 degrees of freedom
# AIC: 390.12
#
# Number of Fisher Scoring iterations: 4logistic回歸結果解讀:
第一個係數:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.0438 0.2326 -4.488 7.18e-06 ***
截距係數指的是女性患者發生心臟病的優勢比的對數,即log(OR)。因子水平:因為在設置因子時,R默認按照字母表順序進行排序:female(0),male(1)。故female為基線水平,即截距水平。female.log.odds <- log(25 / 71)第二個係數:
Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0438 0.2326 -4.488 7.18e-06 ***
## sexM 1.2737 0.2725 4.674 2.95e-06 ***
係數的指數化,增加係數的可讀性exp(logistic$coefficients[1])
計算偽R2和p值
# 0.3521127
# 表明女性發生心臟病的優勢比為0.35
exp(logistic$coefficients[2])
# 3.573933
# 表明男性發生心臟病的優勢比女性增加3.57倍## 參考: 飽和模型與偏差計算R方與p值## logistic回歸中:
## Null devaince = 2*(0 - LogLikelihood(null model))
## = -2*LogLikihood(null model)
## Residual deviacne = 2*(0 - LogLikelihood(proposed model))
## = -2*LogLikelihood(proposed model)
LL.null <- logistic$null.deviance/-2 #-204.9732
LL.proposed <- logistic$deviance/-2 #-193.059
## McFadden's Pseudo R^2 = [ LL(Null) - LL(Proposed) ] / LL(Null)
(LL.null - LL.proposed) / LL.null # 0.05812569
## chi-square value = 2*(LL(Proposed) - LL(Null))
## p-value = 1 - pchisq(chi-square value, df = 2-1)
1 - pchisq(2*(LL.proposed - LL.null), df=1) #1.053157e-06
1 - pchisq((logistic$null.deviance - logistic$deviance), df=1) #1.053157e-06
查看logistic回歸的預測結果(基於性別預測是否患病的結果)predicted.data <- data.frame(
probability.of.hd=logistic$fitted.values,
sex=data$sex)
str(predicted.data)
# 'data.frame': 297 obs. of 2 variables:
# $ probability.of.hd: num 0.557 0.557 0.557 0.557 0.26 ...
# $ sex : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 1 2 2 ...利用ggplot2 R 包將預測結果可視化:
library("ggplot2")
ggplot(data=predicted.data, aes(x=sex, y=probability.of.hd)) +
geom_point(aes(color=sex), size=5) +
xlab("Sex") +
ylab("Predicted probability of getting heart disease")
## 在預測結果中,僅有兩個概率值(男性一個,女性一個)
## 故可使用交叉表進行匯總預測結果
xtabs(~ probability.of.hd + sex, data=predicted.data)
# sex
# probability.of.hd F M
# 0.260416666667241 96 0
# 0.55721393034826 0 201基於該模型,我們僅能根據性別判斷出兩個概率,女性發病的概率較低(p=0.26),而男性發病的概率較高(p=0.56)。
5. 較多變量的logistic回歸
##"hd~."表示使用數據中所有變量
logistic <- glm(hd ~ ., data=data, family="binomial")
summary(logistic)
# Call:
# glm(formula = hd ~ ., family = "binomial", data = data)
#
# Deviance Residuals:
# Min 1Q Median 3Q Max
# -3.0490 -0.4847 -0.1213 0.3039 2.9086
#
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -6.253978 2.960399 -2.113 0.034640 *
# age -0.023508 0.025122 -0.936 0.349402
# sexM 1.670152 0.552486 3.023 0.002503 **
# cp2 1.448396 0.809136 1.790 0.073446 .
# cp3 0.393353 0.700338 0.562 0.574347
# cp4 2.373287 0.709094 3.347 0.000817 ***
# trestbps 0.027720 0.011748 2.359 0.018300 *
# chol 0.004445 0.004091 1.087 0.277253
# fbs1 -0.574079 0.592539 -0.969 0.332622
# restecg1 1.000887 2.638393 0.379 0.704424
# restecg2 0.486408 0.396327 1.227 0.219713
# thalach -0.019695 0.011717 -1.681 0.092781 .
# exang1 0.653306 0.447445 1.460 0.144267
# oldpeak 0.390679 0.239173 1.633 0.102373
# slope2 1.302289 0.486197 2.679 0.007395 **
# slope3 0.606760 0.939324 0.646 0.518309
# ca3 2.237444 0.514770 4.346 1.38e-05 ***
# ca4 3.271852 0.785123 4.167 3.08e-05 ***
# ca5 2.188715 0.928644 2.357 0.018428 *
# thal3 -0.168439 0.810310 -0.208 0.835331
# thal4 1.433319 0.440567 3.253 0.001141 **
# ---
# Signif. codes: 0 『***』 0.001 『**』 0.01 『*』 0.05 『.』 0.1 『 』 1
#
# (Dispersion parameter for binomial family taken to be 1)
#
# Null deviance: 409.95 on 296 degrees of freedom
# Residual deviance: 183.10 on 276 degrees of freedom
# AIC: 225.1
#
# Number of Fisher Scoring iterations: 6輸出的結果較多,但是解釋的方法同前。一些變量的係數具有顯著性,一些變量的係數不具有顯著性。
## 計算該模型的 "Pseudo R-squared"和 p-value
ll.null <- logistic$null.deviance/-2
ll.proposed <- logistic$deviance/-2
## McFadden's Pseudo R^2 = [ LL(Null) - LL(Proposed) ] / LL(Null)
(ll.null - ll.proposed) / ll.null
# [1] 0.5533531
##R^2的P值
1 - pchisq(2*(ll.proposed - ll.null), df=(length(logistic$coefficients)-1))
# [1] 0
# p值非常小,趨近0
## 可視化該模型的預測結果
predicted.data <- data.frame(
probability.of.hd=logistic$fitted.values,
hd=data$hd)
str(predicted.data)
# 'data.frame': 297 obs. of 2 variables:
# $ probability.of.hd: num 0.0621 0.9934 0.9979 0.1136 0.0263 ...
# $ hd : Factor w/ 2 levels "Healthy","Unhealthy": 1 2 2 1 1 1 2 1 2 2 ..
## 對預測結果進行排序:按照預測發生心臟病的概率大小
predicted.data <- predicted.data[
order(predicted.data$probability.of.hd, decreasing=FALSE),]
predicted.data$rank <- 1:nrow(predicted.data)
## 可視化預測結果:不同的顏色展示患病或不患病的真實分類
ggplot(data=predicted.data, aes(x=rank, y=probability.of.hd)) +
geom_point(aes(color=hd), alpha=1, shape=4, stroke=2) +
xlab("Index") +
ylab("Predicted probability of getting heart disease")
ggsave("heart_disease_probabilities.pdf")
基於較複雜的模型,得出一個連續型的模型。總體而言,該模型能夠將絕大部分的患者判定為患者,其對應的患病的p值>0.5;絕大部分的非患者被判定為非患者,其對應患病的p值<0.5。參考視頻:https://www.youtube.com/watch?v=C4N3_XJJ-jU編輯:呂瓊
校審:羅鵬