更新時間:2019年11月08日11時57分 來源:傳智播客 瀏覽次數:
# TODO: customer churn early-warning model
# 1. Imports
# NOTE: the original `from __future__ import division` was dropped. It is a
# no-op on Python 3, no `/` operator appears in the visible script, and a
# __future__ import is a SyntaxError unless it is the first statement of the
# file.
import sys

import numpy as np
import pandas as pd

# 2. Load and inspect the data
churn_df = pd.read_csv('UserDrain_data/churn.csv')
col_names = churn_df.columns.tolist()
# Author's notes from a previous run: churn_df.shape == (3333, 21) and the
# columns are
#   ['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan",
#    'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge',
#    'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls',
#    'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge',
#    'CustServ Calls', 'Churn?']
to_show = col_names[:6] + col_names[-6:]  # first 6 and last 6 columns (12 total)
# print("\nSample data:\n", churn_df[to_show].head(6))
#   State  Account Length  Area Code ... Intl Charge  CustServ Calls  Churn?
# 0    KS             128        415 ...        2.70               1  False.
# 1    OH             107        415 ...        3.70               1  False.

# 2.1 Encode the target: the literal string 'True.' becomes 1, anything else 0.
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.', 1, 0)

# 2.2 Drop the columns that are not used as features (the target included).
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis=1)

# The yes/no columns are turned into booleans ('yes' -> True, otherwise False).
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
# print(churn_feat_space[yes_no_cols])
#    Int'l Plan  VMail Plan
# 0       False        True
# 1       False        True

# Names of the remaining feature columns.
features = churn_feat_space.columns
# print("churn_feat_space:\n", churn_feat_space.head())
#    Account Length  Int'l Plan ... Intl Charge  CustServ Calls
# 0             128       False ...         2.7               1
# 1             107       False ...         3.7               1

# Convert the DataFrame to a float ndarray; booleans become 1.0 / 0.0.
# `DataFrame.as_matrix()` and the `np.float` alias no longer exist in current
# pandas / NumPy, so use `astype(float).to_numpy()` instead.
X = churn_feat_space.astype(float).to_numpy()

# Print arrays in full, without truncation. A NaN threshold is rejected by
# NumPy (and `np.NaN` itself is gone in NumPy 2), so use sys.maxsize.
np.set_printoptions(threshold=sys.maxsize)
# print("feature matrix:\n", X)
# [[128.   0.   1. ...   3.   2.7  1. ]
#  [107.   0.   1. ...   3.   3.7  1. ]
# 2.3 Feature standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)
# Author's notes from a previous run:
#   "Feature space holds 3333 observations and 17 features"
#   np.unique(y) -> [0 1]; len(y[y == 0]) -> 2850
#   X[0] after standardization:
#   [ 0.67648946 -0.32758048  1.6170861   1.23488274  1.56676695  0.47664315
#     1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
#     0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]

# 3. K-fold cross-validation
# `sklearn.cross_validation` was removed from scikit-learn; KFold now lives in
# `sklearn.model_selection` and takes `n_splits` instead of (n, n_folds).
from sklearn.model_selection import KFold


def run_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold predictions from 5-fold shuffled cross-validation.

    Args:
        X: Feature array, indexable by an array of row indices.
        y: Label array aligned with the rows of ``X``; must provide ``copy()``.
        clf_class: Classifier class (not an instance) exposing ``fit`` and
            ``predict``; a fresh instance is created for every fold.
        **kwargs: Keyword arguments forwarded to ``clf_class``.

    Returns:
        A copy of ``y`` in which each test-fold position has been overwritten
        with the prediction of the model trained on the other folds.
    """
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    # Author's note: with 3333 rows each split is 2666 train / 667 test.
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # A new, untrained model per fold so no state leaks between folds.
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


# 4. Modelling
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN


def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    # The elementwise comparison yields booleans, which NumPy averages as
    # 1.0 / 0.0.
    return np.mean(y_true == y_pred)


# 4.1 Cross-validated accuracy of SVM / random forest / k-nearest-neighbours
print("Support vector machines:", "%.3f" % accuracy(y, run_cv(X, y, SVC)))
print("Random forest:", "%.3f" % accuracy(y, run_cv(X, y, RF)))
print("K-nearest-neighbors:", "%.3f" % accuracy(y, run_cv(X, y, KNN)))