R_Graphics_cookbook_in_sage

4145 days ago by takepwave

連続値をカテゴリに変換する

# pandas provides the DataFrame container used throughout the worksheet.
import pandas as pd
# Star import, as the worksheet relies on bare names such as ggplot, aes,
# geom_* and ggsave.
# NOTE(review): presumably this also supplies the mtcars / diamonds sample
# frames used in later cells — confirm.
from ggplot import *
       
# Sample data: twelve rows with a continuous age and a categorical sex.
age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
sex = ['F', 'M', 'M', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'F', 'M']
df = pd.DataFrame({'age': age, 'sex': sex})
df  # bare expression so the notebook echoes the frame
       
   age sex
0   20   F
1   22   M
2   25   M
3   27   M
4   21   F
5   23   M
6   37   F
7   31   M
8   61   F
9   45   M
10  41   F
11  32   M

[12 rows x 2 columns]
   age sex
0   20   F
1   22   M
2   25   M
3   27   M
4   21   F
5   23   M
6   37   F
7   31   M
8   61   F
9   45   M
10  41   F
11  32   M

[12 rows x 2 columns]
# Boundary values used to split the continuous ages into categories.
bins = [18, 25, 35, 60, 100]
cat_names = ['youth', 'YoungAdult', 'MiddleAged', 'Senior']
# Label every age with the name of the interval it falls into and store the
# result as a new 'bins' column.
df['bins'] = pd.cut(df.age, bins, labels=cat_names)
df.head()
       
  age sex        bins
0  20   F       youth
1  22   M       youth
2  25   M       youth
3  27   M  YoungAdult
4  21   F       youth

[5 rows x 3 columns]
  age sex        bins
0  20   F       youth
1  22   M       youth
2  25   M       youth
3  27   M  YoungAdult
4  21   F       youth

[5 rows x 3 columns]
# Load RUtil.py from the location held in DATA.
# NOTE(review): presumably this is what defines the preGraph/postGraph helpers
# used by the R plotting cells — confirm.
load(DATA + 'RUtil.py') 
       
# One-time installation, kept for reference:
# r("install.packages('ggplot2')")
r('library(ggplot2)')
       
[1] "ggplot2"   "stats"     "graphics"  "grDevices" "utils"    
"datasets"  "methods"   "base"     
[1] "ggplot2"   "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
# One-time installation, kept for reference:
# r("install.packages('gcookbook')")
r('library(gcookbook)')
       
[1] "gcookbook" "ggplot2"   "stats"     "graphics"  "grDevices" "utils" 
"datasets"  "methods"  
[9] "base"     
[1] "gcookbook" "ggplot2"   "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
[9] "base"     
# How to fetch a gcookbook sample data set from R: convert the R object with
# sageobj() and take its 'DATA' entry.
dat = sageobj(r('heightweight'))['DATA']
# dat
       
# A factor column in the R data frame does not transfer cleanly to pandas.
# In heightweight the sex column is a factor, so replace it with its own
# 'DATA' entry before building the DataFrame.  This is done in place, the same
# way the ToothGrowth cell handles supp, so the earlier `sex` list is not
# rebound as a side effect.
dat['sex'] = dat['sex']['DATA']
heightweight = pd.DataFrame(dat)
       
# Recode sex: 1 -> 'f', 2 -> 'm'; any other value is left unchanged.
# The column is replaced as a whole instead of assigning through
# `heightweight.sex[mask] = ...`, because chained-indexing assignment may write
# to a temporary copy rather than to the frame.
sex_labels = {1: 'f', 2: 'm'}
heightweight['sex'] = heightweight['sex'].map(
    lambda code: sex_labels.get(code, code)
)
heightweight.head()
       
  ageMonth  ageYear  heightIn sex  weightLb
0      143    11.92      56.3   f      85.0
1      155    12.92      62.3   f     105.0
2      153    12.75      63.3   f     108.0
3      161    13.42      59.0   f      92.0
4      191    15.92      62.5   f     112.5

[5 rows x 5 columns]
  ageMonth  ageYear  heightIn sex  weightLb
0      143    11.92      56.3   f      85.0
1      155    12.92      62.3   f     105.0
2      153    12.75      63.3   f     108.0
3      161    13.42      59.0   f      92.0
4      191    15.92      62.5   f     112.5

[5 rows x 5 columns]
# Show the last rows of heightweight to check that the 'm' recoding was applied.
heightweight.tail() 
       
    ageMonth  ageYear  heightIn sex  weightLb
231      164    13.67      66.5   m     112.0
232      189    15.75      65.0   m     114.0
233      164    13.67      61.5   m     140.0
234      167    13.92      62.0   m     107.5
235      151    12.58      59.3   m      87.0

[5 rows x 5 columns]
    ageMonth  ageYear  heightIn sex  weightLb
231      164    13.67      66.5   m     112.0
232      189    15.75      65.0   m     114.0
233      164    13.67      61.5   m     140.0
234      167    13.92      62.0   m     107.5
235      151    12.58      59.3   m      87.0

[5 rows x 5 columns]
# Rec.2.1 Create a scatter plot
ggplot(mtcars, aes(x='wt', y='mpg')) + geom_point()
       
<ggplot: (33353505)>
<ggplot: (33353505)>
# Save the figure as Rec.2.1.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('Rec.2.1.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# Rec.2.2 Create a line graph
# Pull R's pressure data set into a pandas DataFrame.
pressure = pd.DataFrame(sageobj(r('pressure'))['DATA'])
pressure.head()
       
   pressure  temperature
0    0.0002            0
1    0.0012           20
2    0.0060           40
3    0.0300           60
4    0.0900           80

[5 rows x 2 columns]
   pressure  temperature
0    0.0002            0
1    0.0012           20
2    0.0060           40
3    0.0300           60
4    0.0900           80

[5 rows x 2 columns]
# Pressure against temperature, drawn with a line layer and a point layer.
(
    ggplot(pressure, aes(x='temperature', y='pressure'))
    + geom_line()
    + geom_point()
)
       
<ggplot: (33424753)>
<ggplot: (33424753)>
# Save the figure as Rec.2.2.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('Rec.2.2.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# Rec.2.3 Create a bar graph
# cyl is a continuous value, so wrap it in factor() to treat it as discrete.
ggplot(mtcars, aes(x='factor(cyl)')) + geom_bar()
       
<ggplot: (33602325)>
<ggplot: (33602325)>
# Save the figure as Rec.2.3.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('Rec.2.3.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# Rec.2.4 Create a histogram
# NOTE(review): binwidth is passed as the string '4', whereas the R call for
# the same figure uses the number 4 — confirm the Python port interprets the
# string as a bin width of 4.
ggplot(mtcars, aes(x='mpg')) + geom_histogram(binwidth='4')
       
<ggplot: (33826237)>
<ggplot: (33826237)>
# Save the figure as Rec.2.4.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('Rec.2.4.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# The result differs from R's!  Draw the same histogram with R's ggplot2.
graph = preGraph("fig2.4.pdf")
r('p <- ggplot(mtcars, aes(x=mpg)) + geom_histogram(binwidth=4)')
r('plot(p)')
postGraph(graph)
       
# Bring the ToothGrowth data over from R.
dat = sageobj(r('ToothGrowth'))['DATA']
# Convert supp from a factor to plain values by taking its 'DATA' entry.
dat['supp'] = dat['supp']['DATA']
ToothGrowth = pd.DataFrame(dat)
# Recode supp: 1 -> 'OJ', 2 -> 'VC'; any other value is left unchanged.
# The column is replaced as a whole instead of assigning through
# `ToothGrowth.supp[mask] = ...`, because chained-indexing assignment may write
# to a temporary copy rather than to the frame.
supp_labels = {1: 'OJ', 2: 'VC'}
ToothGrowth['supp'] = ToothGrowth['supp'].map(
    lambda code: supp_labels.get(code, code)
)
       
# Rec.2.5 Create a box plot
# geom_boxplot does not appear to be implemented yet
# ggplot(ToothGrowth, aes(x='interaction(supp, dose)', y='len')) + geom_boxplot()
       
#ggsave('Rec.2.5.png', dpi=50) 
       
# Box plot of len for every supp/dose combination, drawn with R's ggplot2.
graph = preGraph("fig2.5.pdf")
r('p <- ggplot(ToothGrowth, aes(x=interaction(supp, dose), y=len)) + geom_boxplot()')
r('plot(p)')
postGraph(graph)
       
# Rec.2.6 Plot a function curve
# stat_function does not appear to be implemented yet, so draw it with R.
graph = preGraph("fig2.6.pdf")
r('myfun <- function(xvar){ 1/(1 + exp(-xvar + 10)) }')
r('p <- ggplot(data.frame(x=c(0, 20)), aes(x=x)) + stat_function(fun=myfun, geom="line")')
r('plot(p)')
postGraph(graph)
       
# Rec.3.1 Create a bar graph
# Mean weight per group, entered by hand as a small DataFrame.
pg_mean = pd.DataFrame({
    'group': ['ctrl', 'trt1', 'trt2'],
    'weight': [5.032, 4.661, 5.526],
})
pg_mean.head()
       
  group            weight
0  ctrl  5.03200000000000
1  trt1  4.66100000000000
2  trt2  5.52600000000000

[3 rows x 2 columns]
  group            weight
0  ctrl  5.03200000000000
1  trt1  4.66100000000000
2  trt2  5.52600000000000

[3 rows x 2 columns]
# Bar graph of pg_mean with group on the x axis.
# NOTE(review): the value column is passed as the `weight` aesthetic rather
# than `y` as in the R version — presumably this sets the bar heights; confirm.
ggplot(pg_mean, aes(x='group', weight='weight')) + geom_bar() 
       
<ggplot: (33824705)>
<ggplot: (33824705)>
# Save the figure as Rec.3.1.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('Rec.3.1.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# The aesthetics are specified differently from the R version, so here is the
# R form for comparison.
graph = preGraph("fig3.1.pdf")
r('p <- ggplot(pg_mean, aes(x=group, y=weight)) + geom_bar(stat="identity")')
r('plot(p)')
postGraph(graph)
       
# Fig3-2
# Pull R's BOD data set into a pandas DataFrame.
BOD = pd.DataFrame(sageobj(r('BOD'))['DATA'])
BOD.head()
       
   Time  demand
0     1     8.3
1     2    10.3
2     3    19.0
3     4    16.0
4     5    15.6

[5 rows x 2 columns]
   Time  demand
0     1     8.3
1     2    10.3
2     3    19.0
3     4    16.0
4     5    15.6

[5 rows x 2 columns]
# With ggplot, this is plotted as if x were factor(Time).
ggplot(BOD, aes(x='Time', weight='demand')) + geom_bar(stat="identity")
       
<ggplot: (33963725)>
<ggplot: (33963725)>
# Save the figure as fig.3.2a.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('fig.3.2a.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# Same BOD bar graph, this time with Time explicitly wrapped in factor().
(
    ggplot(BOD, aes(x='factor(Time)', weight='demand'))
    + geom_bar(stat="identity")
)
       
<ggplot: (34100509)>
<ggplot: (34100509)>
# Save the figure as fig.3.2b.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('fig.3.2b.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# Fig3-3
# Bar graph of pg_mean with a light-blue fill and a black outline.
ggplot(pg_mean, aes(x='group', weight='weight')) + geom_bar(stat="identity", fill="lightblue", colour="black")
       
<ggplot: (34221493)>
<ggplot: (34221493)>
# Save the figure as fig.3.3.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('fig.3.3.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# Rec.3.2 Group bars together
# Cabbage weights for every Cultivar/Date combination, entered by hand.
cabbage_exp = pd.DataFrame({
    'Cultivar': ['c39', 'c39', 'c39', 'c52', 'c52', 'c52'],
    'Date': ['d16', 'd20', 'd21', 'd16', 'd20', 'd21'],
    'Weight': [3.18, 2.8, 2.74, 2.26, 3.11, 1.47],
})
cabbage_exp
       
  Cultivar Date            Weight
0      c39  d16  3.18000000000000
1      c39  d20  2.80000000000000
2      c39  d21  2.74000000000000
3      c52  d16  2.26000000000000
4      c52  d20  3.11000000000000
5      c52  d21  1.47000000000000

[6 rows x 3 columns]
  Cultivar Date            Weight
0      c39  d16  3.18000000000000
1      c39  d20  2.80000000000000
2      c39  d21  2.74000000000000
3      c52  d16  2.26000000000000
4      c52  d20  3.11000000000000
5      c52  d21  1.47000000000000

[6 rows x 3 columns]
# Side-by-side (dodged) bars are not possible.  Is the colour of the stacked
# d20 bar wrong?
# ggplot(cabbage_exp, aes(x='factor(Date)', weight='Weight', colour='Cultivar')) + geom_bar(position='dodge')
ggplot(cabbage_exp, aes(x='factor(Date)', weight='Weight', colour='Cultivar')) + geom_bar()
       
<ggplot: (34220393)>
<ggplot: (34220393)>
# Save the figure as Rec.3.2.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('Rec.3.2.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# Rec.3.2 Group bars together (R version)
# y=Weight already holds the bar heights, so geom_bar needs stat="identity";
# its default stat counts rows and must not be combined with a y aesthetic.
graph = preGraph("fig3.4.pdf")
r('p <- ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity", position="dodge")')
r('plot(p)')
postGraph(graph)
       
# Rec.3.3 Create a bar graph of counts
ggplot(diamonds, aes(x='cut')) + geom_bar()
       
<ggplot: (33963709)>
<ggplot: (33963709)>
# Save the figure as Rec.3.3.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('Rec.3.3.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# Rec.3.4 Create a coloured bar graph
# Keep only the rows of uspopchange whose Change rank is above 40.
r('upc <- subset(uspopchange, rank(Change)>40)')
graph = preGraph("Rec.3.4.pdf")
r('p <- ggplot(upc, aes(x=Abb, y=Change, fill=Region)) + geom_bar(stat="identity")')
r('plot(p)')
postGraph(graph)
       
# Rec.3.5 Colour bars differently for positive and negative values
# Add a pos column to the data frame that records whether the value is
# positive or negative.
r('csub <- subset(climate, Source=="Berkeley" & Year >= 1900)')
r('csub$pos <- csub$Anomaly10y >= 0')
r('head(csub)')
       
      Source Year Anomaly1y Anomaly5y Anomaly10y Unc10y   pos
101 Berkeley 1900        NA        NA     -0.171  0.108 FALSE
102 Berkeley 1901        NA        NA     -0.162  0.109 FALSE
103 Berkeley 1902        NA        NA     -0.177  0.108 FALSE
104 Berkeley 1903        NA        NA     -0.199  0.104 FALSE
105 Berkeley 1904        NA        NA     -0.223  0.105 FALSE
106 Berkeley 1905        NA        NA     -0.241  0.107 FALSE
      Source Year Anomaly1y Anomaly5y Anomaly10y Unc10y   pos
101 Berkeley 1900        NA        NA     -0.171  0.108 FALSE
102 Berkeley 1901        NA        NA     -0.162  0.109 FALSE
103 Berkeley 1902        NA        NA     -0.177  0.108 FALSE
104 Berkeley 1903        NA        NA     -0.199  0.104 FALSE
105 Berkeley 1904        NA        NA     -0.223  0.105 FALSE
106 Berkeley 1905        NA        NA     -0.241  0.107 FALSE
# Bar graph of Anomaly10y per Year, filled according to the pos flag.
graph = preGraph("Rec.3.5.pdf")
r('p <- ggplot(csub, aes(x=Year, y=Anomaly10y, fill=pos)) + geom_bar(stat="identity", position="identity")')
r('plot(p)')
postGraph(graph)
       
# Rec.3.6 Adjust bar width and spacing
# The maximum width is 1.0.
# The setting has no effect.
# NOTE(review): width is passed as the string '1.0' — confirm whether a numeric
# value would be honoured.
ggplot(pg_mean, aes(x='group', weight='weight')) + geom_bar(stat="identity", width='1.0')
       
<ggplot: (33825729)>
<ggplot: (33825729)>
# Save the figure as Rec.3.6.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('Rec.3.6.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.
# Rec.3.7 Create a stacked bar graph
graph = preGraph("Rec.3.7.pdf")
r('p <- ggplot(cabbage_exp, aes(x=Date, y=Weight, fill=Cultivar)) + geom_bar(stat="identity")')
r('plot(p)')
postGraph(graph)
       
# Rec.3.8 100% stacked bar graph (Practical Data Science version)
# y=Weight already holds the bar heights, so geom_bar needs stat="identity";
# its default stat counts rows and must not be combined with a y aesthetic.
graph = preGraph("Rec.3.8.pdf")
r('p <- ggplot(cabbage_exp) + geom_bar(aes(x=Date, y=Weight, fill=Cultivar), stat="identity", position="fill")')
r('plot(p)')
postGraph(graph)
       
# Rec.3.9 Add labels to a bar graph; vjust adjusts the label position
# The Python version does not work:
# ggplot(cabbage_exp, aes(x='factor(Date)', weight='Weight')) + geom_bar() + geom_text(aes(y='Weight', label='Weight'))
graph = preGraph("Rec.3.9.pdf")
# vjust and colour are fixed settings, so they go outside aes(); inside aes()
# they are treated as data mappings (colour="white" becomes a one-level
# variable with a legend instead of white text).
r('p <- ggplot(cabbage_exp, aes(x=interaction(Date, Cultivar), y=Weight)) + geom_bar(stat="identity") + geom_text(aes(label=Weight), vjust=1.5, colour="white")')
r('plot(p)')
postGraph(graph)
       
#ggsave('Rec.3.9.png', dpi=50) 
       
# geom_text is implemented.
# Parentheses replace the backslash continuation, which breaks as soon as
# anything (even a space) follows the backslash.
(
    ggplot(aes(x='wt', y='mpg', label='name'), data=mtcars)
    + geom_text()
)
       
<ggplot: (34355773)>
<ggplot: (34355773)>
# Save the figure as test1.0.png at 50 dpi.
# NOTE(review): presumably saves the plot produced by the preceding cell — confirm.
ggsave('test1.0.png', dpi=50) 
       
Saving 11.0 x 8.0 in image.
Saving 11.0 x 8.0 in image.