通過輸入已有的數值對 (x, y) 建立簡單線性回歸模型;之後再輸入新的 x 值時,即可預測對應的 y 值。
代碼如下:
"""Simple (one-variable) linear regression fitted by least squares.

The script loads ``(x, y)`` pairs from a CSV file, fits ``y = w1 * x + w0``
on a random training split, reports the RMSE on the held-out rows and plots
the data together with the fitted line.
"""
from csv import reader
from math import sqrt
from random import randrange
from random import seed

import numpy as np

# Slope and intercept of the most recent fit performed by
# simple_linear_regression(); visualization() reads them to draw the line.
# They stay None until a model has been fitted.
w_k = None
w_b = None


def mean(values):
    """Return the arithmetic mean of *values*.

    Raises:
        ZeroDivisionError: if *values* is empty.
    """
    return sum(values) / float(len(values))


def variance(values, mean):
    """Return the sum of squared deviations of *values* from *mean*.

    The sum is deliberately not divided by the number of samples: the
    regression slope is a ratio of two such sums, so the factor cancels.
    """
    return sum((x - mean) ** 2 for x in values)


def convariance(x, mean_x, y, mean_y):
    """Return the sum of co-deviations of *x* and *y* about their means.

    Like variance(), the result is not divided by the number of samples.
    *y* is indexed by position in *x*, so it must be at least as long as *x*.
    """
    return sum(
        ((x_i - mean_x) * (y[i] - mean_y) for i, x_i in enumerate(x)),
        0.0,
    )


def coefficients(dataset):
    """Compute the least-squares regression coefficients for *dataset*.

    Args:
        dataset: sequence of rows where ``row[0]`` is x and ``row[1]`` is y.

    Returns:
        Tuple ``(w0, w1)``: intercept and slope of ``y = w1 * x + w0``.

    Raises:
        ZeroDivisionError: if *dataset* is empty or every x is identical.
    """
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]
    x_mean, y_mean = mean(x), mean(y)
    w1 = convariance(x, x_mean, y, y_mean) / variance(x, x_mean)
    w0 = y_mean - w1 * x_mean
    return w0, w1


def simple_linear_regression(train, test):
    """Fit a line on *train* and predict y for every row of *test*.

    As a side effect the fitted slope and intercept are stored in the
    module-level ``w_k`` and ``w_b`` so that visualization() can draw the
    fitted line afterwards.

    Args:
        train: rows ``[x, y]`` used to estimate the coefficients.
        test: rows whose first element is the x value to predict for.

    Returns:
        List of predicted y values, one per row of *test*.
    """
    global w_k, w_b
    w0, w1 = coefficients(train)
    w_k = w1
    w_b = w0
    return [w1 * row[0] + w0 for row in test]


def rmse_metric(actual, predicted):
    """Return the root-mean-square error between two value sequences.

    Raises:
        ZeroDivisionError: if *actual* is empty.
    """
    sum_error = sum(
        ((predicted[i] - target) ** 2 for i, target in enumerate(actual)),
        0.0,
    )
    mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)


def load_csv(filename):
    """Load the data rows of a CSV file, skipping the header and blank lines.

    Args:
        filename: path of the CSV file; its first line is treated as a
            header and discarded.

    Returns:
        List of rows, each a list of strings. If the file cannot be opened
        or read, the error is printed and the rows read so far (possibly
        none) are returned.
    """
    dataset = []
    try:
        # newline='' is what the csv module expects of the file object.
        with open(filename, 'r', newline='') as file:
            csv_reader = reader(file)
            # Skip the header row; the default keeps an empty file from
            # raising StopIteration.
            next(csv_reader, None)
            # The csv reader yields [] for blank lines; drop those.
            dataset.extend(row for row in csv_reader if row)
    except IOError as err:
        print("file error :", str(err))
    return dataset


def str_column_to_float(dataset, column):
    """Convert, in place, the string in *column* of every row to float."""
    for row in dataset:
        row[column] = float(row[column].strip())


def train_test_split(dataset, percent):
    """Randomly split *dataset* into a training part and a test part.

    Args:
        dataset: sequence of rows; it is not modified.
        percent: fraction of the rows (0..1) to draw for the training set.

    Returns:
        Tuple ``(train, test)``; *test* holds the rows that were not drawn.
    """
    train = []
    train_size = percent * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy


def evaluate_algorithm(dataset, algorithm, split_percent=None, *args):
    """Run *algorithm* and return the RMSE of its predictions.

    Args:
        dataset: rows whose last element is the target value.
        algorithm: callable ``algorithm(train, test, *args)`` returning one
            prediction per test row.
        split_percent: fraction of rows used for training. When omitted,
            the algorithm is trained and scored on the whole dataset and
            each prediction is printed.
        *args: extra positional arguments forwarded to *algorithm*.

    Returns:
        The root-mean-square error on the evaluated rows.
    """
    if split_percent is None:
        train, test = dataset, dataset
    else:
        train, test = train_test_split(dataset, split_percent)

    # Hide the target from the algorithm by blanking it in a copy of each row.
    test_set = []
    for row in test:
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)

    predicted = algorithm(train, test_set, *args)
    if split_percent is None:
        for val in predicted:
            print('%.3f\t' % val)

    actual = [row[-1] for row in test]
    return rmse_metric(actual, predicted)


def visualization(dateset):
    """Plot the data points and the most recently fitted regression line.

    Args:
        dateset: rows ``[x, y]`` to draw as points. The parameter keeps its
            original spelling so existing keyword callers continue to work.

    Raises:
        RuntimeError: if no model has been fitted yet.
    """
    # Imported here so the numeric functions above can be used without a
    # plotting backend installed.
    from matplotlib import pyplot as plt

    if w_k is None or w_b is None:
        raise RuntimeError(
            "no fitted model: call simple_linear_regression() first")

    fig = plt.figure()
    # One row, one column of sub-plots; draw into the first (only) cell.
    ax1 = fig.add_subplot(1, 1, 1)
    x = [row[0] for row in dateset]
    y = [row[1] for row in dateset]
    ax1.plot(x, y, 'bs')
    x_1 = np.linspace(min(x), max(x))
    y_1 = x_1 * w_k + w_b
    ax1.plot(x_1, y_1)
    plt.grid()
    plt.show()


def main():
    """Load insurance.csv, evaluate the regression and plot the result."""
    # Seed the random number generator so the train/test split is repeatable.
    seed(2)

    # Load the insurance data and convert every column to float.
    filename = 'insurance.csv'
    dataset = load_csv(filename)
    print(dataset)
    if not dataset:
        raise SystemExit('no data rows could be loaded from %s' % filename)
    for col in range(len(dataset[0])):
        str_column_to_float(dataset, col)

    # Fraction of the rows used for training.
    percent = 0.6
    rmse = evaluate_algorithm(dataset, simple_linear_regression, percent)
    print('RMSE : %.3f' % rmse)
    visualization(dataset)


if __name__ == '__main__':
    main()

# Optional sanity check on a tiny hand-made dataset (kept disabled):
#
# dataset = [[1.2, 1.1], [2.4, 3.5], [4.1, 3.2], [3.4, 2.8], [5, 5.4]]
# x = [row[0] for row in dataset]
# y = [row[1] for row in dataset]
# mean_x, mean_y = mean(x), mean(y)  # means
# var_x, var_y = variance(x, mean_x), variance(y, mean_y)
# convar = convariance(x, mean_x, y, mean_y)  # covariance
#
# print('x的統計特性:均值 = % .3f 方差 = %.3f' % (mean_x, var_x))
# print('y的統計特性:均值 = % .3f 方差 = %.3f' % (mean_y, var_y))
# print('協方差 = :%.3f' % convar)
#
# w0, w1 = coefficients(dataset)
# print('回歸係數分別為: w0 = %.3f, w1 = %.3f' % (w0, w1))
#
# rmse = evaluate_algorithm(dataset, simple_linear_regression)
# print('RMSE : %.3f' % rmse)

# NOTE: the insurance.csv file used above must be located in the same
# directory as this script.
文件內容如下:
運行結果: