通過散點圖,觀察一下數據點的空間分布情況,代碼如下:
from numpy import *
import matplotlib.pyplot as plt
def loadData():
    """Load the sample data from testSet.txt.

    Each line of the file holds two feature values and a 0/1 label,
    whitespace separated.  A constant 1.0 is prepended to every sample
    as the intercept (bias) term.

    Returns:
        (train_x, train_y): an (n, 3) feature matrix and an (n, 1)
        label column vector, both numpy matrices.
    """
    train_x = []
    train_y = []
    # 'with' guarantees the handle is closed (the original never closed it)
    with open('testSet.txt') as fileIn:
        for line in fileIn:
            lineArr = line.strip().split()
            # [bias, x1, x2]
            train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
            train_y.append(float(lineArr[2]))
    return mat(train_x), mat(train_y).transpose()
# Scatter-plot the raw data: red circles for class 0, blue for class 1.
train_x, train_y = loadData()
numSamples = shape(train_x)[0]
# Iterate over the real sample count instead of a hard-coded 100, and use
# range (not the Python-2-only xrange) so the script also runs on Python 3.
for i in range(numSamples):
    if int(train_y[i, 0]) == 0:
        plt.plot(train_x[i, 1], train_x[i, 2], 'or')
    elif int(train_y[i, 0]) == 1:
        plt.plot(train_x[i, 1], train_x[i, 2], 'ob')
# (the original also computed min_x/max_x here but never used them)
plt.xlabel('X1'); plt.ylabel('X2')
plt.show()
運行python腳本畫出來的測試數據的散點圖如圖3所示:
圖3 測試數據散點圖
從圖3中,可以看出測試數據基本可以通過一條直線進行分隔,下邊筆者的邏輯回歸python代碼將通過用一條直線將數據分成兩部分,代碼中的優化算法可以選擇梯度下降法、隨機梯度下降和平滑梯度下降三種方法。模型的代碼如下(讀者可將模型代碼複製入logic.py文件中):
from numpy import *
import matplotlib.pyplot as plt
import time
def sigmoid(inX):
    """Logistic function: map inX (scalar or numpy array/matrix) into (0, 1)."""
    denom = 1 + exp(-inX)
    return 1.0 / denom
def trainLogRegres(train_x, train_y, opts):
    """Train a logistic-regression model.

    Args:
        train_x: (n, m) numpy matrix of samples (first column is the bias 1.0).
        train_y: (n, 1) numpy matrix of 0/1 labels.
        opts: dict with keys
            'alpha'        - learning rate,
            'maxIter'      - number of passes over the data,
            'optimizeType' - 'gradDescent', 'stocGradDescent' or
                             'smoothStocGradDescent'.

    Returns:
        (m, 1) numpy matrix of learned weights.

    Raises:
        NameError: if opts['optimizeType'] is not one of the three methods.
    """
    startTime = time.time()

    numSamples, numFeatures = shape(train_x)
    alpha = opts['alpha']; maxIter = opts['maxIter']
    weights = ones((numFeatures, 1))

    for k in range(maxIter):
        if opts['optimizeType'] == 'gradDescent':
            # Batch gradient descent: one update per pass over all samples.
            output = sigmoid(train_x * weights)
            error = train_y - output
            weights = weights + alpha * train_x.transpose() * error
        elif opts['optimizeType'] == 'stocGradDescent':
            # Stochastic gradient descent: one update per sample, in order.
            for i in range(numSamples):
                output = sigmoid(train_x[i, :] * weights)
                error = train_y[i, 0] - output
                weights = weights + alpha * train_x[i, :].transpose() * error
        elif opts['optimizeType'] == 'smoothStocGradDescent':
            # Smoothed SGD: decaying learning rate plus sampling without
            # replacement.  dataIndex must be (re)initialised each pass —
            # the source had lost this line and raised NameError — and must
            # be a real list so entries can be deleted under Python 3.
            dataIndex = list(range(numSamples))
            for i in range(numSamples):
                alpha = 4.0 / (1.0 + k + i) + 0.01
                randIndex = int(random.uniform(0, len(dataIndex)))
                # Map the drawn position through the pool of unused samples
                # so each sample is visited exactly once per pass (the
                # source indexed train_x with randIndex directly, which
                # defeats the without-replacement scheme).
                sampleIndex = dataIndex[randIndex]
                output = sigmoid(train_x[sampleIndex, :] * weights)
                error = train_y[sampleIndex, 0] - output
                weights = weights + alpha * train_x[sampleIndex, :].transpose() * error
                del dataIndex[randIndex]
        else:
            raise NameError('Not support optimize method type!')

    # Parenthesised print is valid on both Python 2 and Python 3.
    print('Congratulations, training complete! Took %fs!' % (time.time() - startTime))
    return weights
def testLogRegres(weights, test_x, test_y):
    """Return the classification accuracy of `weights` on (test_x, test_y).

    A sample is predicted positive when sigmoid(x . w) > 0.5.

    Args:
        weights: (m, 1) weight matrix from trainLogRegres.
        test_x:  (n, m) numpy matrix of samples.
        test_y:  (n, 1) numpy matrix of 0/1 labels.

    Returns:
        float in [0, 1]: fraction of correctly classified samples.
    """
    numSamples, numFeatures = shape(test_x)
    matchCount = 0
    # range instead of xrange keeps the function valid on Python 3.
    for i in range(numSamples):
        predict = sigmoid(test_x[i, :] * weights)[0, 0] > 0.5
        if predict == bool(test_y[i, 0]):
            matchCount += 1
    accuracy = float(matchCount) / numSamples
    return accuracy
def showLogRegres(weights, train_x, train_y):
    """Plot the 2-D training data and the learned linear decision boundary.

    Only works for data with exactly 3 features (bias + 2 coordinates);
    otherwise prints a message and returns 1.
    """
    numSamples, numFeatures = shape(train_x)
    if numFeatures != 3:
        # NOTE(review): the source string was truncated mid-literal;
        # message reconstructed from context.
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    # Samples: red circles for label 0, blue circles for label 1.
    for i in range(numSamples):
        if int(train_y[i, 0]) == 0:
            plt.plot(train_x[i, 1], train_x[i, 2], 'or')
        elif int(train_y[i, 0]) == 1:
            plt.plot(train_x[i, 1], train_x[i, 2], 'ob')

    # Decision boundary w0 + w1*x1 + w2*x2 = 0, drawn between the extreme
    # x1 values of the training set.
    min_x = min(train_x[:, 1])[0, 0]
    max_x = max(train_x[:, 1])[0, 0]
    weights = weights.getA()  # matrix -> plain ndarray for scalar arithmetic
    y_min_x = float(-weights[0] - weights[1] * min_x) / weights[2]
    y_max_x = float(-weights[0] - weights[1] * max_x) / weights[2]
    plt.plot([min_x, max_x], [y_min_x, y_max_x], '-g')
    plt.xlabel('X1'); plt.ylabel('X2')
    # show() belongs inside the function; in the source it had lost its
    # indentation and would have run at import time instead.
    plt.show()
對代碼的分類效果進行測試,測試代碼如下(讀者可將測試代碼複製入test_logic.py文件中):
from numpy import *
import matplotlib.pyplot as plt
import time
import logic as logic
def loadData():
    """Load the sample data from testSet.txt.

    Each line holds two feature values and a 0/1 label, whitespace
    separated.  A constant 1.0 is prepended to every sample as the
    intercept (bias) term.

    Returns:
        (train_x, train_y): an (n, 3) feature matrix and an (n, 1)
        label column vector, both numpy matrices.
    """
    train_x = []
    train_y = []
    # 'with' guarantees the handle is closed (the original never closed it)
    with open('testSet.txt') as fileIn:
        for line in fileIn:
            lineArr = line.strip().split()
            # [bias, x1, x2]
            train_x.append([1.0, float(lineArr[0]), float(lineArr[1])])
            train_y.append(float(lineArr[2]))
    return mat(train_x), mat(train_y).transpose()
# --- Driver script -------------------------------------------------------
# Parenthesised print calls keep the script runnable on Python 2 and 3.

# step 1: load the data
print("step 1: load data...")
train_x, train_y = loadData()

# NOTE: the model is evaluated on its own training data (no held-out split).
test_x = train_x; test_y = train_y

# step 2: train with the smoothed stochastic-gradient-descent variant
print("step 2: training...")
opts = {'alpha': 0.01, 'maxIter': 5, 'optimizeType': 'smoothStocGradDescent'}
optimalWeights = logic.trainLogRegres(train_x, train_y, opts)
print(optimalWeights)

# step 3: accuracy on the (training) set
print("step 3: testing...")
accuracy = logic.testLogRegres(optimalWeights, test_x, test_y)

# step 4: report the accuracy and plot the decision boundary
print("step 4: show the result...")
print('The classify accuracy is: %.3f%%' % (accuracy * 100))
logic.showLogRegres(optimalWeights, train_x, train_y)
運行測試代碼test_logic.py,測試結果如圖4所示:
圖4 邏輯回歸線性分類效果
筆者統計了一下,在迭代5次的情況下,線性分類的正確率為96%。讀者可以通過複製代碼,自行的進行測試,也可以將線性分類改變成曲線形式,去測一下分類效果。在這篇文章中筆者簡單介紹了一下邏輯回歸的原理,並且給出了實現的代碼以供讀者直觀學習體驗。在下一篇,本公眾號中關於機器學習文章中,筆者將詳細的介紹支持向量機SVM的原理,並附上python實現的代碼。
下篇: