# Single-project analysis: fit a normal distribution to the `p1` column of
# test.csv, plot it against the data and print the 80 % coverage threshold.
from scipy.stats import chi2
from scipy.stats import norm
from scipy.stats import t
from scipy.stats import f
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import chi2_contingency
from scipy import integrate


def coverage_threshold(dist, coverage=0.8, limit=400):
    """Return the smallest integer n in [0, limit) with P(0 <= X <= n) > coverage.

    Args:
        dist: Frozen ``scipy.stats`` distribution (anything exposing ``cdf``).
        coverage: Probability mass that the interval [0, n] must exceed.
        limit: Exclusive upper bound of the integer candidates that are tried.

    Returns:
        The first qualifying integer, or ``None`` when no candidate below
        ``limit`` reaches the requested coverage.
    """
    candidates = np.arange(limit)
    # The exact CDF difference replaces the former trapezoid integration of the
    # PDF over [0, n]: it is the quantity that integration approximated and it
    # does not depend on ``scipy.integrate.trapz``, which newer SciPy removed.
    mass = dist.cdf(candidates) - dist.cdf(0)
    hits = np.flatnonzero(mass > coverage)
    return int(hits[0]) if hits.size else None


def analyze_single_project(csv_path="test.csv", column="p1"):
    """Plot one column against its fitted normal and print its threshold.

    Reads ``csv_path``, drops missing values of ``column``, draws a density
    histogram, a KDE curve and the fitted normal PDF on the current figure, then
    prints the value of ``coverage_threshold`` for the fitted distribution when
    one exists.

    Returns:
        The printed threshold, or ``None`` if none was found.
    """
    # Imported here so the numeric helper above can be imported (and tested)
    # on machines without a plotting backend.
    import matplotlib.pyplot as plt

    plt.figure()
    train = pd.read_csv(csv_path)
    lead_times = train[column].dropna()

    mean, std = stats.norm.fit(lead_times)
    fitted = stats.norm(mean, std)

    # `density=True` replaces the `normed=1` keyword that Matplotlib removed.
    plt.hist(lead_times, bins=30, density=True, facecolor='blue', alpha=0.5)
    lead_times.plot(kind='kde', secondary_y=True)

    # Draw the fitted PDF between its 1st and 99th percentiles.
    x = np.linspace(fitted.ppf(0.01), fitted.ppf(0.99), 100)
    plt.plot(x, fitted.pdf(x), c='orange')
    plt.xlabel('reqirement time')
    plt.title('reqirement time on NormalDistribution', size=20)
    plt.legend(['Origin', 'NormDistribution'])

    threshold = coverage_threshold(fitted)
    if threshold is not None:
        print(threshold)
    return threshold


if __name__ == "__main__":
    analyze_single_project()
# Batch analysis: compute the same statistics and plot for every project column
# of test.csv, save one image per project and export the summary table.
import os

from scipy.stats import chi2
from scipy.stats import norm
from scipy.stats import t
from scipy.stats import f
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import chi2_contingency
from scipy import integrate
from scipy.stats import kstest

try:
    # `lillifors` was a deprecated misspelling that current statsmodels
    # releases no longer appear to provide (TODO confirm against the installed
    # version). Nothing in this section calls it, so the name the file has
    # always imported is kept bound without making statsmodels mandatory.
    from statsmodels.stats.diagnostic import lilliefors as lillifors
except ImportError:
    lillifors = None

projects_index = [
    'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10', 'p11', 'p12',
    'p14', 'p15', 'p16', 'p17', 'p18', 'p19', 'p20', 'p21', 'p22', 'p23',
    'p24', 'p25', 'p26', 'p27', 'p28', 'p29', 'p30', 'p31', 'p32', 'p33',
    'p34', 'p35', 'p36', 'p38', 'p40', 'p41', 'p42', 'p43', 'p45', 'p46',
]

# Column order of the exported report.
REPORT_COLUMNS = ['項目名稱', '有效需求總數', '平均值', '中位數', '眾數', '標準差',
                  'p值', '需求交付周期閾值']


def coverage_threshold(dist, coverage=0.8, limit=400):
    """Return the smallest integer n in [0, limit) with P(0 <= X <= n) > coverage.

    Args:
        dist: Frozen ``scipy.stats`` distribution (anything exposing ``cdf``).
        coverage: Probability mass that the interval [0, n] must exceed.
        limit: Exclusive upper bound of the integer candidates that are tried.

    Returns:
        The first qualifying integer, or ``None`` when no candidate below
        ``limit`` reaches the requested coverage.
    """
    candidates = np.arange(limit)
    # The exact CDF difference replaces the former trapezoid integration of the
    # PDF over [0, n]: it is the quantity that integration approximated and it
    # does not depend on ``scipy.integrate.trapz``, which newer SciPy removed.
    mass = dist.cdf(candidates) - dist.cdf(0)
    hits = np.flatnonzero(mass > coverage)
    return int(hits[0]) if hits.size else None


def normality_p_value(values, mean, std):
    """Return the p-value of a normality test chosen by sample size.

    Args:
        values: One-dimensional sample without missing values.
        mean: Location of the normal distribution fitted to ``values``.
        std: Scale of the normal distribution fitted to ``values``.

    Returns:
        ``-1`` for fewer than 8 observations (no test is run), the
        ``scipy.stats.normaltest`` p-value for 8-49 observations, and the
        Kolmogorov-Smirnov p-value against N(mean, std) otherwise.
    """
    size = len(values)
    if size < 8:
        return -1
    if size < 50:
        return stats.normaltest(values)[1]
    # Compare with the *fitted* normal. Without `args`, kstest compares the raw
    # data with the standard normal N(0, 1), which rejects any sample that is
    # not already standardised. Because the parameters come from the same
    # sample, this p-value tends to be conservative.
    return kstest(values, 'norm', args=(mean, std))[1]


def analyze_all_projects(csv_path="test.csv"):
    """Analyse every column in ``projects_index`` and export the summary.

    For each project the missing values are dropped, a normal distribution is
    fitted, a histogram/KDE/fitted-PDF plot is saved under ``image/`` and one
    row of descriptive statistics is collected. The resulting table is printed
    and written to ``Req_Leadtime.xlsx``.

    Returns:
        The summary ``DataFrame`` (columns as in ``REPORT_COLUMNS``).
    """
    # Imported here so the numeric helpers above can be imported (and tested)
    # on machines without a plotting backend.
    import matplotlib.pyplot as plt

    train = pd.read_csv(csv_path)
    plt.figure(figsize=(16, 16))
    # savefig does not create missing directories.
    os.makedirs("image", exist_ok=True)

    search_limit = 400
    rows = []
    for index in projects_index:
        print(index)
        lead_times = train[index].dropna()
        if lead_times.empty:
            # Nothing to fit or plot for a project without valid values.
            print("no valid values for %s, skipped" % index)
            continue

        mean, std = stats.norm.fit(lead_times)
        fitted = stats.norm(mean, std)

        # `density=True` replaces the `normed=1` keyword that Matplotlib removed.
        plt.hist(lead_times, bins=30, density=True, facecolor='blue', alpha=0.5)
        lead_times.plot(kind='kde', secondary_y=True)
        # Draw the fitted PDF between its 1st and 99th percentiles.
        x = np.linspace(fitted.ppf(0.01), fitted.ppf(0.99), 100)
        plt.plot(x, fitted.pdf(x), c='orange')

        p_value = normality_p_value(lead_times, mean, std)
        print(p_value)

        threshold = coverage_threshold(fitted, limit=search_limit)
        if threshold is None:
            # NOTE(review): the original search loop simply ended on its last
            # candidate in this case; that value is still reported so existing
            # reports stay comparable, but it does not reach 80 % coverage.
            threshold = search_limit - 1
        else:
            print(threshold)

        rows.append({
            '項目名稱': index,
            '有效需求總數': len(lead_times),
            '平均值': lead_times.mean(),
            '中位數': lead_times.median(),
            # Series.mode() is sorted, so this is the smallest most frequent
            # value; indexing the scipy.stats.mode result with [0][0] fails
            # on SciPy versions that return scalars for 1-D input.
            '眾數': lead_times.mode().iloc[0],
            '標準差': lead_times.std(ddof=1),
            'p值': p_value,
            '需求交付周期閾值': threshold,
        })

        plt.xlabel('Req leadtime')
        # "%10.3 f" (with a space) is not a valid conversion and raised ValueError.
        plt.title("(%s) Req_leadtime on NormalDistribution, p = %10.3f, Req_leadtime_req = %d"
                  % (index, p_value, threshold), fontsize=20)
        plt.legend(['Origin', 'NormDistribution'])
        plt.savefig("image/'" + index + "'.png")
        plt.clf()

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x and
    # copied the whole table on every iteration.
    report = pd.DataFrame(rows, columns=REPORT_COLUMNS)
    print(report)
    # The `encoding` keyword of to_excel was removed in pandas 2.0.
    report.to_excel('Req_Leadtime.xlsx', index=True, header=True)
    return report


if __name__ == "__main__":
    Req_Leadtime = analyze_all_projects()