1. 數據獲取和預處理
import json as json
import pandas as pd
import numpy as np

delay= json.loads('xxxx')
df=pd.DataFrame(delay['data'])
df['dateTime']=pd.to_datetime(df['entry_time']+8*3600*1000,unit='ms')
data = df[['dateTime','latency','jitter','loss_percentage']]

2. 數據可視化觀測
大多數人會教你直接用 Pandas 的 plot() 畫圖,但是你們看看這圖可以麼……反正我是看了想吐的。我在自己的私募基金量化算法庫中做了一個稍微好看點的 Bokeh Wrapper,畫出來的圖如下,還可以拖拉、縮放、選擇區域。為了不影響閱讀,這個 Wrapper 的代碼見本文最後。
其實從圖上就能看到很明顯的周期性,以及隨使用率不同而出現的峰值結構;當然也有一些測量異常和系統故障點。總體來說,這還是一個很不錯的數據集。
3.1 Statsmodels Seasonal Decompose
我們先用一個比較老牌的 Python 統計模型庫 statsmodels,對數據做周期性分解,如下:
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(data['latency'],freq=48)
data['seasonal']= decomposition.seasonal
data['trend'] = decomposition.trend
data['residual'] = decomposition.resid

然後畫個圖:
# Plot each decomposition component on its own chart, then stack the three
# charts vertically in a single column layout.
chart2 = bokeh_multi_line_chart(data, ['seasonal'], ['周期性'], title='周期')
chart3 = bokeh_multi_line_chart(data, ['trend'], ['延遲'], title='趨勢')
chart4 = bokeh_multi_line_chart(data, ['residual'], ['殘差'], title='殘差')
show_column([chart2, chart3, chart4])
可以看到趨勢線維持在平穩的 66~67ms 之間。每天的周期性有點好玩:到了晚高峰有些抖動,主要是丟包引起的延遲測量問題。而且 5 月 10 日有明顯的異常,這類異常可以通過檢測殘差捕獲並上報。
我們再來看丟包率的數據,周期性可以看到明顯的早高峰,下午工作段和晚高峰,而趨勢線可以明顯的看出節假日丟包率更高
3.2 Facebook Prophet預測未來值
這是一套開箱即用的工具,Prophet通過將全自動預測與在線學習相結合從而保證了該工具能夠解決大多數商業業務問題,Prophet工作流程如下圖所示:
然後fit一下:
# The library is published as ``prophet`` in current releases and as
# ``fbprophet`` in older ones; try the current name first and fall back so the
# snippet runs with either installation.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

# Prophet reads the timestamp from a column named ``ds`` and the value to
# forecast from a column named ``y``.
data['ds'] = data['dateTime']
data['y'] = data['latency']
m = Prophet(changepoint_prior_scale=0.01).fit(data)

# Extend the time axis by 96 hourly steps, predict over it and plot the result.
future = m.make_future_dataframe(periods=96, freq='H')
fcst = m.predict(future)
fig = m.plot(fcst)
同樣預測丟包值:
學會了麼?AIOps就這麼簡單這麼容易,趕緊找點數據算算寫點報告給老闆們看吧,加薪了別忘了我就好~~嘿嘿,最後 Bokeh Wrapper代碼:
import tabulate as tabulate
import pandas as pd
import numpy as np
import bokeh.plotting
import bokeh.models
import bokeh.layouts
import bokeh.palettes

# Send Bokeh output to the notebook for the charts shown by the helpers below.
bokeh.plotting.output_notebook()
def bokeh_multi_line_chart(df, item_list, legend_list, title, width=1900, height=600,
                           legend_location='bottom_left', x_axis='dateTime',
                           x_axis_type='datetime', y_axis_type='auto',
                           line_width=1.5, alpha=0.7):
    """Draw several columns of *df* as lines on one shared Bokeh figure.

    Args:
        df: Table indexed by column name; ``df[x_axis]`` supplies the x values
            and ``df[item]`` the y values of each line.
        item_list: Column names to plot, one line per entry.
        legend_list: Legend text for each entry of ``item_list``, matched by
            position.
        title: Figure title.
        width: Figure width passed to ``bokeh.plotting.figure``.
        height: Figure height passed to ``bokeh.plotting.figure``.
        legend_location: Value assigned to ``fig.legend.location``.
        x_axis: Name of the column used for the x axis.
        x_axis_type: Passed through to ``bokeh.plotting.figure``.
        y_axis_type: Passed through to ``bokeh.plotting.figure``.
        line_width: Line width passed to ``fig.line``.
        alpha: Line opacity passed to ``fig.line``.

    Returns:
        The populated figure. Nothing is displayed here; hand the figure to
        ``show_column``, ``show_row`` or ``show_grid``.

    Raises:
        IndexError: If ``legend_list`` has fewer entries than ``item_list``.
    """
    fig = bokeh.plotting.figure(width=width, height=height,
                                x_axis_type=x_axis_type, y_axis_type=y_axis_type,
                                title=title)

    # Small charts get a hand-picked three-colour scheme; larger ones fall back
    # to the 10- and 20-colour categorical palettes shipped with Bokeh.
    line_count = len(item_list)
    if line_count <= 3:
        palette = ['#d25535', '#35b2d2', '#98d235']
    elif line_count <= 10:
        palette = bokeh.palettes.Category10[10]
    else:
        palette = bokeh.palettes.Category20[20]

    for idx, item in enumerate(item_list):
        # NOTE(review): newer Bokeh releases replace the ``legend=`` keyword
        # with ``legend_label=`` -- confirm against the installed version.
        fig.line(
            df[x_axis],
            df[item],
            # Wrap around so that more than 20 series reuse colours instead of
            # running off the end of the palette.
            color=palette[idx % len(palette)],
            legend=legend_list[idx],
            line_width=line_width,
            alpha=alpha,
        )

    fig.legend.location = legend_location
    fig.legend.label_text_font_size = "0.8em"
    return fig
def bokeh_hbar_chart(df, categories_col, value_col, title, color='#B2D235',
                     width=400, height=300):
    """Build a horizontal bar chart with one bar per row of *df*.

    Args:
        df: Table indexed by column name.
        categories_col: Column whose values label the bars on the y axis.
        value_col: Column whose values give the right-hand end of each bar.
        title: Figure title.
        color: Fill colour of the bars.
        width: Figure width passed to ``bokeh.plotting.figure``.
        height: Figure height passed to ``bokeh.plotting.figure``.

    Returns:
        The populated figure; nothing is displayed here.
    """
    # The y-range factors are given in reverse row order.
    categories = list(df[categories_col])[::-1]

    source = bokeh.models.ColumnDataSource(df[[categories_col, value_col]])
    fig = bokeh.plotting.figure(
        title=title,
        y_range=bokeh.models.FactorRange(factors=categories),
        width=width,
        height=height,
    )
    # Bars start at 0 and extend to the value column; both ``y`` and ``right``
    # are column names resolved against ``source``.
    fig.hbar(left=0, y=categories_col, right=value_col, color=color,
             source=source, height=0.3)
    return fig
def bokeh_vbar_chart(df, categories_col, value_col, title, color='#4F4478',
                     width=600, height=380):
    """Build a vertical bar chart with one bar per row of *df*.

    Args:
        df: Table indexed by column name.
        categories_col: Column whose values label the bars on the x axis.
        value_col: Column whose values give the height of each bar.
        title: Figure title.
        color: Fill colour of the bars.
        width: Figure width passed to ``bokeh.plotting.figure``.
        height: Figure height passed to ``bokeh.plotting.figure``.

    Returns:
        The populated figure; nothing is displayed here.
    """
    rdf = df[[categories_col, value_col]]
    factors = list(rdf[categories_col])

    fig = bokeh.plotting.figure(
        title=title,
        width=width,
        height=height,
        # Same construction style as the horizontal-bar helper.
        x_range=bokeh.models.FactorRange(factors=factors),
    )
    # Bars rise from 0 to the value column; data is passed directly rather
    # than through a ColumnDataSource.
    fig.vbar(bottom=0, top=rdf[value_col], x=factors, color=color,
             width=0.5, alpha=0.8)
    return fig
def bokeh_multi_hbar_chart(df, cat_col, value_list, width=400, height=300):
    """Build one horizontal bar chart per value column, each in its own colour.

    Args:
        df: Table indexed by column name.
        cat_col: Column whose values label the bars in every chart.
        value_list: Value columns; each produces one chart, titled with the
            column name.
        width: Width of every chart.
        height: Height of every chart.

    Returns:
        A list of figures in the order of ``value_list``; nothing is displayed
        here.
    """
    # Small sets get a hand-picked three-colour scheme; larger ones fall back
    # to the 10- and 20-colour categorical palettes shipped with Bokeh.
    value_count = len(value_list)
    if value_count <= 3:
        palette = ['#5154eb', '#b2d235', '#df9815']
    elif value_count <= 10:
        palette = bokeh.palettes.Category10[10]
    else:
        palette = bokeh.palettes.Category20[20]

    return [
        bokeh_hbar_chart(
            df, cat_col, value_name, value_name,
            # Wrap around so that more than 20 columns reuse colours instead
            # of running off the end of the palette.
            color=palette[idx % len(palette)],
            width=width, height=height,
        )
        for idx, value_name in enumerate(value_list)
    ]
def bokeh_hist_chart(item_list, title, bins=100, width=400, height=300,
                     legend_location='bottom_left'):
    """Overlay density-normalised histograms of several samples on one figure.

    Args:
        item_list: Sequence of samples; each one is passed to
            ``numpy.histogram`` and drawn as its own set of bars.
        title: Figure title.
        bins: ``bins`` argument forwarded to ``numpy.histogram``.
        width: Figure width passed to ``bokeh.plotting.figure``.
        height: Figure height passed to ``bokeh.plotting.figure``.
        legend_location: Accepted for signature compatibility; this function
            does not use it.

    Returns:
        The populated figure; nothing is displayed here.
    """
    fig = bokeh.plotting.figure(width=width, height=height, title=title)

    sample_count = len(item_list)
    if sample_count <= 3:
        palette = ['#036564', 'red', 'navy']
    elif sample_count <= 10:
        # ``bokeh.palettes`` has no ``Category10b`` (only Category10,
        # Category20, Category20b and Category20c), so use Category10 here.
        palette = bokeh.palettes.Category10[10]
    else:
        palette = bokeh.palettes.Category20b[20]

    for idx, sample in enumerate(item_list):
        hist, edges = np.histogram(sample, density=True, bins=bins)
        # One rectangle per bin: consecutive edges are the left/right sides.
        fig.quad(
            top=hist,
            bottom=0,
            left=edges[:-1],
            right=edges[1:],
            # Wrap around so that more than 20 samples reuse colours instead
            # of running off the end of the palette.
            fill_color=palette[idx % len(palette)],
            line_color="#033649",
            alpha=0.5,
        )
    return fig
def show_grid(chart_list, num_in_row=4):
    """Display the charts in a grid, filling rows from left to right.

    Args:
        chart_list: Iterable of figures to display.
        num_in_row: Number of charts after which a new row is started. The
            last row may be shorter.
    """
    rows = []
    current_row = []
    for chart in chart_list:
        current_row.append(chart)
        # Close the row as soon as it reaches the requested length.
        if len(current_row) == num_in_row:
            rows.append(current_row)
            current_row = []
    # Keep whatever is left over as a final, partially filled row.
    if current_row:
        rows.append(current_row)

    bokeh.plotting.show(bokeh.layouts.gridplot(rows))
def show_column(chart_list):
    """Display the given charts in a single ``bokeh.layouts.column`` layout.

    Args:
        chart_list: Figures to display, in order.
    """
    bokeh.plotting.show(bokeh.layouts.column(chart_list))
def show_row(chart_list):
    """Display the given charts in a single ``bokeh.layouts.row`` layout.

    Args:
        chart_list: Figures to display, in order.
    """
    bokeh.plotting.show(bokeh.layouts.row(chart_list))