配列の操作・統計
配列の操作
行をずらす
1
2
3
4
5
6
7
8
9
|
>>> pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
>>> s
2013-01-01 NaN
2013-01-02 NaN
2013-01-03 1.0
2013-01-04 3.0
2013-01-05 5.0
2013-01-06 NaN
Freq: D, dtype: float64
|
各行への減算
1
2
3
4
5
6
7
8
|
>>> df.sub(s, axis='index')
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 -1.861849 -3.104569 -1.494929 4.0 1.0
2013-01-04 -2.278445 -3.706771 -4.039575 2.0 0.0
2013-01-05 -5.424972 -4.432980 -4.723768 0.0 -1.0
2013-01-06 NaN NaN NaN NaN NaN
|
関数の適用
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
>>> df.apply(np.cumsum) #累計
A B C D F
2013-01-01 0.000000 0.000000 -1.509059 5 NaN
2013-01-02 1.212112 -0.173215 -1.389850 10 1.0
2013-01-03 0.350263 -2.277784 -1.884779 15 3.0
2013-01-04 1.071818 -2.984555 -2.924354 20 6.0
2013-01-05 0.646846 -2.417535 -2.648122 25 10.0
2013-01-06 -0.026844 -2.303886 -4.126549 30 15.0
>>> df.apply(lambda x: x.max() - x.min()) #行の最大値-最小値
A 2.073961
B 2.671590
C 1.785291
D 0.000000
F 4.000000
dtype: float64
|
ヒストグラム
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
>>> s = pd.Series(np.random.randint(0, 7, size=10))
>>> s
0 4
1 2
2 1
3 2
4 6
5 4
6 4
7 6
8 4
9 4
dtype: int64
>>> s.value_counts()
4 5
6 2
2 2
1 1
dtype: int64
|
配列のマージ
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
>>> left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
>>> right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
>>> left
key lval
0 foo 1
1 bar 2
>>> right
key rval
0 foo 4
1 bar 5
>>> pd.merge(left, right, on='key')
key lval rval
0 foo 1 4
1 bar 2 5
|
データのグループ化
データの集計
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
>>> df = pd.DataFrame({
'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
>>> df
A B C D
0 foo one -1.202872 -0.055224
1 bar one -1.814470 2.395985
2 foo two 1.018601 1.552825
3 bar three -0.595447 0.166599
4 foo two 1.395433 0.047609
5 bar two -0.392670 -0.136473
6 foo one 0.007207 -0.561757
7 foo three 1.928123 -1.623033
#ここまでデータ準備
>>> df.groupby('A').sum() #A列の文字列で数値をグループ化
C D
A
bar -2.802588 2.42611
foo 3.146492 -0.63958
>>> df.groupby(['A','B']).sum() #A、B列の両方の文字列で数値をグループ化
C D
A B
bar one -1.814470 2.395985
three -0.595447 0.166599
two -0.392670 -0.136473
foo one -1.195665 -0.616981
three 1.928123 -1.623033
two 2.414034 1.600434
|
集計配列の変形
pd.MultiIndex.from_tuplesでindexの結合が行われ視認性が高まります。また、stack()を使うことにより様々な表示方法が可能になります。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
>>> tuples = list(zip(*[
['bar', 'bar', 'baz', 'baz','foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
]))
>>> index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
>>> df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
>>> df2 = df[:4]
>>> df2
A B
first second
bar one 0.029399 -0.542108
two 0.282696 -0.087302
baz one -1.575170 1.771208
two 0.816482 1.100230
>>> stacked = df2.stack()
>>> stacked
first second
bar one A 0.029399
B -0.542108
two A 0.282696
B -0.087302
baz one A -1.575170
B 1.771208
two A 0.816482
B 1.100230
dtype: float64
>>> stacked.unstack()
A B
first second
bar one 0.029399 -0.542108
two 0.282696 -0.087302
baz one -1.575170 1.771208
two 0.816482 1.100230
|
ピボットテーブル
pandasのピボットテーブルはexcelよりも素早く作成が可能です。指定項目は当然excelと同様です
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
>>> df = pd.DataFrame({
'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
>>> df
A B C D E
0 one A foo 1.418757 -0.179666
1 one B foo -1.879024 1.291836
2 two C foo 0.536826 -0.009614
3 three A bar 1.006160 0.392149
4 one B bar -0.029716 0.264599
5 one C bar -1.146178 -0.057409
6 two A foo 0.100900 -1.425638
7 three B foo -1.035018 1.024098
8 one C foo 0.314665 -0.106062
9 one A bar -0.773723 1.824375
10 two B bar -1.170653 0.595974
11 three C bar 0.648740 1.167115
#ここまでデータ準備
>>> pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
C bar foo
A B
one A -0.773723 1.418757
B -0.029716 -1.879024
C -1.146178 0.314665
three A 1.006160 NaN
B NaN -1.035018
C 0.648740 NaN
two A NaN 0.100900
B -1.170653 NaN
C NaN 0.536826
|
時間の取扱について
pandasでは時間データも簡単に取り扱うことができます。
1
2
3
4
5
6
7
8
|
>>> rng = pd.date_range('1/1/2012', periods=100, freq='S')
#2012-01-01 00:00:00から「秒」単位で100秒分出力
>>> ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
#各秒にランダムで数値を割り当て
>>> ts.resample('5Min').sum()
#5秒周期で数値を足していく
2012-01-01 25083
Freq: 5T, dtype: int64
|
csv・excelへ出力・読み込み
CSV
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
>>> df.to_csv('foo.csv')
>>> pd.read_csv('foo.csv')
Unnamed: 0 A B C D
0 2000-01-01 0.266457 -0.399641 -0.219582 1.186860
1 2000-01-02 -1.170732 -0.345873 1.653061 -0.282953
2 2000-01-03 -1.734933 0.530468 2.060811 -0.515536
3 2000-01-04 -1.555121 1.452620 0.239859 -1.156896
4 2000-01-05 0.578117 0.511371 0.103552 -2.428202
5 2000-01-06 0.478344 0.449933 -0.741620 -1.962409
6 2000-01-07 1.235339 -0.091757 -1.543861 -1.084753
.. ... ... ... ... ...
993 2002-09-20 -10.628548 -9.153563 -7.883146 28.313940
994 2002-09-21 -10.390377 -8.727491 -6.399645 30.914107
995 2002-09-22 -8.985362 -8.485624 -4.669462 31.367740
996 2002-09-23 -9.558560 -8.781216 -4.499815 30.518439
997 2002-09-24 -9.902058 -9.340490 -4.386639 30.105593
998 2002-09-25 -10.216020 -9.480682 -3.933802 29.758560
999 2002-09-26 -11.856774 -10.671012 -3.216025 29.369368
[1000 rows x 5 columns]
|
EXCEL
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
>>> df.to_excel('foo.xlsx', sheet_name='Sheet1')
>>> pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
A B C D
2000-01-01 0.266457 -0.399641 -0.219582 1.186860
2000-01-02 -1.170732 -0.345873 1.653061 -0.282953
2000-01-03 -1.734933 0.530468 2.060811 -0.515536
2000-01-04 -1.555121 1.452620 0.239859 -1.156896
2000-01-05 0.578117 0.511371 0.103552 -2.428202
2000-01-06 0.478344 0.449933 -0.741620 -1.962409
2000-01-07 1.235339 -0.091757 -1.543861 -1.084753
... ... ... ... ...
2002-09-20 -10.628548 -9.153563 -7.883146 28.313940
2002-09-21 -10.390377 -8.727491 -6.399645 30.914107
2002-09-22 -8.985362 -8.485624 -4.669462 31.367740
2002-09-23 -9.558560 -8.781216 -4.499815 30.518439
2002-09-24 -9.902058 -9.340490 -4.386639 30.105593
2002-09-25 -10.216020 -9.480682 -3.933802 29.758560
2002-09-26 -11.856774 -10.671012 -3.216025 29.369368
[1000 rows x 4 columns]
|
0 コメント:
コメントを投稿