Hiroshi TAKEMOTO (take@pwv.co.jp)

入門機械学習による異常検出

井出　剛著の「入門機械学習による異常検出」(以降、井出本と記す)の例題をSageを使ってお復習いします。

3章　非正規データからの異常検知

井出本の基本は、データに対するモデルを使って負の対数尤度を求め、それを異常検出関数として使うことです。

分布が左右対称でない場合

Rのcarパッケージに入っているデータDavisを使って、多次元データの異常度を求めてみましょう。

準備

いつものように必要なパッケージを読込ます。

# RとPandasで必要なユーティリティ # Rの必要なライブラリ r('library(ggplot2)') r('library(jsonlite)') # RUtilにRとPandasのデータフレームを相互に変換する関数を追加+GgSaveを追加（2015/07/12） load(DATA + 'RUtil.py')

# 2.2 carパッケージのDavisを使って例題を試す # carパッケージのインストールには、時間が掛かりますので注意してください。 r('library(car)') r('library(MASS)')

 [1] "MASS"      "car"       "jsonlite"  "ggplot2"   "stats"    
"graphics"  "grDevices" "utils"    
 [9] "datasets"  "methods"   "base"

 [1] "MASS"      "car"       "jsonlite"  "ggplot2"   "stats"     "graphics"  "grDevices" "utils"    
 [9] "datasets"  "methods"   "base"

データの性格を知る

Rのデータをpandaのデータフレームに変換して、データの性格を調べます。最初にinfoとdescribeで個数や欠損値、平均、分散などの統計情報を求めます。

# Rのデータをpandaのデータフレームに変換する davis = RDf2PandaDf("Davis")

次にDavisの体重の分布と体重（weight）と身長（height）の散布図をプロットし、おおざっぱな性質を調べます。

# weightの分布をみる # 1個だけ、160以上の値を示している ggplot(davis, aes(x='weight')) + geom_histogram(binwidth=5,color="grey")

<ggplot: (33321117)>

<ggplot: (33321117)>

GgSave('fig_2.1.png', fac=0.8)

Saving 11.0 x 8.0 in image.

Saving 11.0 x 8.0 in image.

# ガンマ分布の異常検出 N = len(davis.weight) mu = davis.mean()['weight'] si = davis.std()['weight']*(N-1)/N kmo = (mu/si)^2 smo = si^2/mu print kmo, smo

19.1928236809 3.42836474164

19.1928236809 3.42836474164

r('ml <- fitdistr(Davis$weight, "gamma")')

     shape         rate   
  22.4854793    0.3417247 
 ( 2.2317648) ( 0.0342978)

     shape         rate   
  22.4854793    0.3417247 
 ( 2.2317648) ( 0.0342978)

kml = r('ml$estimate["shape"]') sml = 1/r('ml$estimate["rate"]') print kml print sml

   shape 
22.48548 
    rate 
2.926333

   shape 
22.48548 
    rate 
2.926333

# SciPyのstatsを使ってガンマ分布の最尤推定値を求める from scipy import stats fit_alpha, fit_loc, fit_beta = stats.gamma.fit(np.array(davis.weight)) print(fit_alpha, fit_loc, fit_beta)

(0.97822474410373339, 50.907615570930275, 15.223888496824298)

(0.97822474410373339, 50.907615570930275, 15.223888496824298)

import scipy.stats as stats # scipyのfitの性能を比べてみるN=1000, N=200での結果 alpha = 5 loc = 0 beta = 22 data = stats.gamma.rvs(alpha, loc=loc, scale=beta, size=1000) fit_alpha, fit_loc, fit_beta=stats.gamma.fit(data) print(fit_alpha, fit_loc, fit_beta)

(4.8646839809590539, 2.0908689871516435, 22.073472385903131)

(4.8646839809590539, 2.0908689871516435, 22.073472385903131)

r(list(data)).name("data") r('ml <- fitdistr(data, "gamma")')

      shape         rate    
  5.089240153   0.046489267 
 (0.220217259) (0.002113778)

      shape         rate    
  5.089240153   0.046489267 
 (0.220217259) (0.002113778)

r('1/ml$estimate["rate"]')

    rate 
21.51034

    rate 
21.51034

data = stats.gamma.rvs(alpha, loc=loc, scale=beta, size=200) fit_alpha, fit_loc, fit_beta=stats.gamma.fit(data) print(fit_alpha, fit_loc, fit_beta)

(3.221810420796253, 21.093760783989296, 26.987507381821338)

(3.221810420796253, 21.093760783989296, 26.987507381821338)

# Rのfitdistrも少ないデータだとrateの計算があまり良くない r(list(data)).name("data") r('ml <- fitdistr(data, "gamma")')

      shape         rate    
  5.176892021   0.047915393 
 (0.501187681) (0.004870251)

      shape         rate    
  5.176892021   0.047915393 
 (0.501187681) (0.004870251)

r('1/ml$estimate["rate"]')

    rate 
20.87012

    rate 
20.87012

# bar_chartでヒストグラムの代わりにするという記事があったので、Rで計算したhistgramの結果を # bar_chartで表示してみる。 r('h <- hist(Davis$weight)') breaks = sageobj(r('h$breaks')) counts = sageobj(r('h$counts')) breaks # 横の刻みの値（下図の0~8の値に相当）

[20, 40, 60, 80, 100, 120, 140, 160, 180]

[20, 40, 60, 80, 100, 120, 140, 160, 180]

bar_chart(counts)

samp = np.array(davis.weight) fit_alpha, fit_loc, fit_beta = stats.gamma.fit(samp) print fit_alpha, fit_loc, fit_beta x = np.linspace(20,180,200) plt.figure() # fitted distribution pdf_fitted = stats.gamma.pdf(x, fit_alpha, loc=fit_loc, scale=fit_beta) plt.title('Gamma distribution') plt.plot(x,pdf_fitted,'r-') plt.hist(samp,bins=24, normed=1,alpha=.3) GgSave("5.png", 0.7)

0.978224744104 50.9076155709 15.2238884968
Saving 8.0 x 6.0 in image.

0.978224744104 50.9076155709 15.2238884968
Saving 8.0 x 6.0 in image.

fit_alpha = 22.48548 fit_loc = 0 fit_beta = 2.926333 x = np.linspace(20,180,200) plt.figure() # fitted distribution pdf_fitted = stats.gamma.pdf(x, fit_alpha, loc=fit_loc, scale=fit_beta) plt.title('Gamma distribution') plt.plot(x,pdf_fitted,'r-') plt.hist(samp,bins=24, normed=1,alpha=.3) GgSave("6.png", 0.7)

Saving 8.0 x 6.0 in image.

Saving 8.0 x 6.0 in image.

# Author: Jake Vanderplas <jakevdp@cs.washington.edu> # import numpy as np import matplotlib.pyplot as plt from scipy.stats import norm from sklearn.neighbors import KernelDensity #---------------------------------------------------------------------- # Plot the progression of histograms to kernels #np.random.seed(1) N = 20 X = np.concatenate((np.random.normal(0, 1, 0.3 * N), np.random.normal(5, 1, 0.7 * N)))[:, np.newaxis] X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis] bins = np.linspace(-5, 10, 10) fig, ax = plt.subplots(2, 2, sharex=True, sharey=True) fig.subplots_adjust(hspace=0.05, wspace=0.05) # histogram 1 ax[0, 0].hist(X[:, 0], bins=bins, fc='#AAAAFF', normed=True) ax[0, 0].text(-3.5, 0.31, "Histogram") # histogram 2 ax[0, 1].hist(X[:, 0], bins=bins + 0.75, fc='#AAAAFF', normed=True) ax[0, 1].text(-3.5, 0.31, "Histogram, bins shifted") # tophat KDE kde = KernelDensity(kernel='tophat', bandwidth=0.75).fit(X) log_dens = kde.score_samples(X_plot) ax[1, 0].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF') ax[1, 0].text(-3.5, 0.31, "Tophat Kernel Density") # Gaussian KDE kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X) log_dens = kde.score_samples(X_plot) ax[1, 1].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF') ax[1, 1].text(-3.5, 0.31, "Gaussian Kernel Density") for axi in ax.ravel(): axi.plot(X[:, 0], np.zeros(X.shape[0]) - 0.01, '+k') axi.set_xlim(-4, 9) axi.set_ylim(-0.02, 0.34) for axi in ax[:, 0]: axi.set_ylabel('Normalized Density') for axi in ax[1, :]: axi.set_xlabel('x') GgSave("1.png", 1.0) #---------------------------------------------------------------------- # Plot all available kernels X_plot = np.linspace(-6, 6, 1000)[:, None] X_src = np.zeros((1, 1)) fig, ax = plt.subplots(2, 3, sharex=True, sharey=True) fig.subplots_adjust(left=0.05, right=0.95, hspace=0.05, wspace=0.05) def format_func(x, loc): if x == 0: return '0' elif x == 1: return 'h' elif x == -1: return '-h' else: return '%ih' % x for i, kernel in enumerate(['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']): axi = ax.ravel()[i] log_dens = KernelDensity(kernel=kernel).fit(X_src).score_samples(X_plot) axi.fill(X_plot[:, 0], np.exp(log_dens), '-k', fc='#AAAAFF') axi.text(-2.6, 0.95, kernel) axi.xaxis.set_major_formatter(plt.FuncFormatter(format_func)) axi.xaxis.set_major_locator(plt.MultipleLocator(1)) axi.yaxis.set_major_locator(plt.NullLocator()) axi.set_ylim(0, 1.05) axi.set_xlim(-2.9, 2.9) ax[0, 1].set_title('Available Kernels') GgSave("2.png", 1.0) #---------------------------------------------------------------------- # Plot a 1D density example N = 100 #np.random.seed(1) X = np.concatenate((np.random.normal(0, 1, 0.3 * N), np.random.normal(5, 1, 0.7 * N)))[:, np.newaxis] X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis] true_dens = (0.3 * norm(0, 1).pdf(X_plot[:, 0]) + 0.7 * norm(5, 1).pdf(X_plot[:, 0])) fig, ax = plt.subplots() ax.fill(X_plot[:, 0], true_dens, fc='black', alpha=0.2, label='input distribution') for kernel in ['gaussian', 'tophat', 'epanechnikov']: kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X) log_dens = kde.score_samples(X_plot) ax.plot(X_plot[:, 0], np.exp(log_dens), '-', label="kernel = '{0}'".format(kernel)) ax.text(6, 0.38, "N={0} points".format(N)) ax.legend(loc='upper left') ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k') ax.set_xlim(-4, 9) ax.set_ylim(-0.02, 0.4) #plt.show() GgSave("3.png", 0.7)

Saving 8.0 x 6.0 in image.

Saving 8.0 x 6.0 in image.

Saving 8.0 x 6.0 in image.

Saving 8.0 x 6.0 in image.

Saving 8.0 x 6.0 in image.

Saving 8.0 x 6.0 in image.

chap3_Introduction_to_Anomaly_Detection_using_Machine_Learning

3607 days ago by takepwave

入門機械学習による異常検出

3章　非正規データからの異常検知

分布が左右対称でない場合

準備

データの性格を知る

chap3_Introduction_to_Anomaly_Detection_using_Machine_Learning

3607 days ago by takepwave

入門機械学習による異常検出

3章 非正規データからの異常検知

分布が左右対称でない場合

準備

データの性格を知る

3章　非正規データからの異常検知