import pandas

# Sample feed records used to build the demo frame. Record 2 has no
# "message" key, which shows up as NaN in the resulting DataFrame.
posts_data = [
    {"id": 1, "ts": "2014-01-01T10:00", "name": "Zbychu",
     "message": "hello world", "likes": 16},
    {"id": 2, "ts": "2014-01-01T13:37", "name": "Zbychu",
     "likes": 20},
    {"id": 3, "ts": "2014-01-01T15:35", "name": "biedny student",
     "message": "pandas 3 bardzo proszę pls", "likes": 3},
    {"id": 4, "ts": "2014-01-02T19:42", "name": "julius_caesar",
     "message": "veni, vidi, vici", "likes": 22},
    {"id": 5, "ts": "2014-02-04T12:02", "name": "julius_caesar",
     "message": "coś po łacinie", "likes": 10},
    {"id": 6, "ts": "2014-04-15T23:15", "name": "kotek",
     "message": "miau miau miau bardzo miau", "likes": 5},
    {"id": 7, "ts": "2014-04-15T18:30", "name": "Zbychu",
     "message": "hello pandas hello hello", "likes": 7},
]

# One row per record, one column per dictionary key.
posts = pandas.DataFrame(posts_data)
posts
id | likes | message | name | ts | |
---|---|---|---|---|---|
0 | 1 | 16 | hello world | Zbychu | 2014-01-01T10:00 |
1 | 2 | 20 | NaN | Zbychu | 2014-01-01T13:37 |
2 | 3 | 3 | pandas 3 bardzo proszę pls | biedny student | 2014-01-01T15:35 |
3 | 4 | 22 | veni, vidi, vici | julius_caesar | 2014-01-02T19:42 |
4 | 5 | 10 | coś po łacinie | julius_caesar | 2014-02-04T12:02 |
5 | 6 | 5 | miau miau miau bardzo miau | kotek | 2014-04-15T23:15 |
6 | 7 | 7 | hello pandas hello hello | Zbychu | 2014-04-15T18:30 |
7 rows × 5 columns
# Order the posts from most to least liked and replace missing values with a
# placeholder text. sort_values() is the supported replacement for the old
# DataFrame.sort(), which current pandas releases no longer provide.
posts = posts.sort_values("likes", ascending=False).fillna("[ No Comment ]")
posts
id | likes | message | name | ts | |
---|---|---|---|---|---|
3 | 4 | 22 | veni, vidi, vici | julius_caesar | 2014-01-02T19:42 |
1 | 2 | 20 | [ No Comment ] | Zbychu | 2014-01-01T13:37 |
0 | 1 | 16 | hello world | Zbychu | 2014-01-01T10:00 |
4 | 5 | 10 | coś po łacinie | julius_caesar | 2014-02-04T12:02 |
6 | 7 | 7 | hello pandas hello hello | Zbychu | 2014-04-15T18:30 |
5 | 6 | 5 | miau miau miau bardzo miau | kotek | 2014-04-15T23:15 |
2 | 3 | 3 | pandas 3 bardzo proszę pls | biedny student | 2014-01-01T15:35 |
7 rows × 5 columns
# Upper-case the message of every post with at least 10 likes.
# .str.decode() is only meaningful for byte strings, so decode just those
# values and leave messages that are already text untouched before
# upper-casing; this keeps non-ASCII letters (ś, ł, ...) correct either way.
posts.loc[posts.likes >= 10, "message"] = (
    posts["message"]
    .map(lambda msg: msg.decode("utf8") if isinstance(msg, bytes) else msg)
    .str.upper()
)
posts
id | likes | message | name | ts | |
---|---|---|---|---|---|
3 | 4 | 22 | VENI, VIDI, VICI | julius_caesar | 2014-01-02T19:42 |
1 | 2 | 20 | [ NO COMMENT ] | Zbychu | 2014-01-01T13:37 |
0 | 1 | 16 | HELLO WORLD | Zbychu | 2014-01-01T10:00 |
4 | 5 | 10 | COŚ PO ŁACINIE | julius_caesar | 2014-02-04T12:02 |
6 | 7 | 7 | hello pandas hello hello | Zbychu | 2014-04-15T18:30 |
5 | 6 | 5 | miau miau miau bardzo miau | kotek | 2014-04-15T23:15 |
2 | 3 | 3 | pandas 3 bardzo proszę pls | biedny student | 2014-01-01T15:35 |
7 rows × 5 columns
# Use the "id" column as the row index.
posts_id = posts.set_index("id")
# Label-based slice on the sorted index: ids 3 through 6, both ends included.
# .loc is used because the .ix indexer has been removed from pandas.
posts_id.sort_index().loc[3:6]
likes | message | name | ts | |
---|---|---|---|---|
id | ||||
3 | 3 | pandas 3 bardzo proszę pls | biedny student | 2014-01-01T15:35 |
4 | 22 | VENI, VIDI, VICI | julius_caesar | 2014-01-02T19:42 |
5 | 10 | COŚ PO ŁACINIE | julius_caesar | 2014-02-04T12:02 |
6 | 5 | miau miau miau bardzo miau | kotek | 2014-04-15T23:15 |
4 rows × 4 columns
# Index the posts by their "ts" strings and sort them.
posts_ts = posts.set_index("ts").sort_index()
# Convert the string labels into real timestamps. pandas.to_datetime()
# replaces Index.to_datetime(), which is no longer available.
posts_ts.index = pandas.to_datetime(posts_ts.index)
# Slice by time range using date strings; .loc replaces the removed .ix.
posts_ts.loc["2014-01-01 12:00":"2014-01-01T16:00"]
id | likes | message | name | |
---|---|---|---|---|
2014-01-01 13:37:00 | 2 | 20 | [ NO COMMENT ] | Zbychu |
2014-01-01 15:35:00 | 3 | 3 | pandas 3 bardzo proszę pls | biedny student |
2 rows × 4 columns
# Mark the naive timestamps in the index as UTC, then express them in the
# Europe/Warsaw time zone.
posts_warsaw = (
    posts_ts
    .tz_localize("UTC")
    .tz_convert("Europe/Warsaw")
)
posts_warsaw
id | likes | message | name | |
---|---|---|---|---|
2014-01-01 11:00:00+01:00 | 1 | 16 | HELLO WORLD | Zbychu |
2014-01-01 14:37:00+01:00 | 2 | 20 | [ NO COMMENT ] | Zbychu |
2014-01-01 16:35:00+01:00 | 3 | 3 | pandas 3 bardzo proszę pls | biedny student |
2014-01-02 20:42:00+01:00 | 4 | 22 | VENI, VIDI, VICI | julius_caesar |
2014-02-04 13:02:00+01:00 | 5 | 10 | COŚ PO ŁACINIE | julius_caesar |
2014-04-15 20:30:00+02:00 | 7 | 7 | hello pandas hello hello | Zbychu |
2014-04-16 01:15:00+02:00 | 6 | 5 | miau miau miau bardzo miau | kotek |
7 rows × 4 columns
# Daily totals of the numeric columns, keeping only days that have posts.
# - resample(...).sum() replaces the removed resample(..., how="sum") form.
# - The numeric columns are selected explicitly so text columns are not summed.
# - min_count=1 makes days without any post come out as NaN (instead of 0),
#   which is what lets dropna() discard them.
posts_by_day = (
    posts_ts[["id", "likes"]]
    .resample("D")
    .sum(min_count=1)
    .dropna()
)
posts_by_day
id | likes | |
---|---|---|
2014-01-01 | 6 | 39 |
2014-01-02 | 4 | 22 |
2014-02-04 | 5 | 10 |
2014-04-15 | 13 | 12 |
4 rows × 2 columns
# Likes per user per month, newest month first.
# - pivot_table: one row per timestamp, one column per user name; the
#   index=/columns= keywords replace the removed rows=/cols= keywords.
# - resample(...).sum(min_count=1) replaces resample(..., how="sum") and keeps
#   NaN for users/months without posts instead of reporting 0.
# NOTE(review): recent pandas releases prefer the "ME" alias for month-end
# frequency; confirm against the installed version before changing "M".
by_month = (
    posts_ts
    .pivot_table(values="likes", index=posts_ts.index, columns="name")
    .resample("M")
    .sum(min_count=1)
    .sort_index(ascending=False)
)
by_month
name | Zbychu | biedny student | julius_caesar | kotek |
---|---|---|---|---|
2014-04-30 | 7 | NaN | NaN | 5 |
2014-03-31 | NaN | NaN | NaN | NaN |
2014-02-28 | NaN | NaN | 10 | NaN |
2014-01-31 | 36 | 3 | 22 | NaN |
4 rows × 4 columns
# IPython magic (not plain Python syntax): selects matplotlib's inline backend
# so figures are shown in the notebook output.
%matplotlib inline
# Draw by_month as a bar chart with a 10x6 figure size and the given title.
plots = by_month.plot(kind="bar", figsize=(10, 6), title="Likes per month per user")
import csv
def the_manual_way():
    """Load ``data.csv`` by hand with the ``csv`` module.

    Reads the file from the current directory and collects one dict per row
    with the keys ``"ts"`` (first CSV field, passed through ``parse``) and
    ``"name"`` (second CSV field). Nothing is returned; the collected list is
    local to the function.
    """
    with open("data.csv") as f:
        reader = csv.reader(f)
        data = []
        # NOTE(review): the original indentation was lost; the statements
        # below are assumed to all belong to the loop body - confirm.
        for row in reader:
            # NOTE(review): `parse` is not defined in this snippet; presumably
            # a date parser (e.g. dateutil.parser.parse) - confirm.
            data.append({"ts": parse(row[0]), "name": row[1]})
            # Function-call form of print works on Python 2 and Python 3;
            # the bare print statement is a syntax error on Python 3.
            print("I wish I've known pandas")
            # ... furious data wrangling ...
import pandas
def the_pandas_way():
    """Return the total likes per user for posts from 2014-01-01 onwards.

    Reads ``data.csv`` from the current directory; the code relies on the
    file having ``ts``, ``name`` and ``likes`` columns.

    Returns:
        A Series indexed by ``name`` holding the summed ``likes``.
    """
    # Parse "ts" as dates so the slice below compares timestamps, not text.
    data = pandas.read_csv("data.csv", parse_dates=["ts"])
    # use pandas, Luke!
    by_time = data.set_index("ts").sort_index()
    # The aggregate used to be computed and thrown away; return it instead.
    return by_time.loc["2014-01-01":].groupby("name")["likes"].sum()
Wes McKinney - Python for Data Analysis (książka autora pandas) http://shop.oreilly.com/product/0636920023784.do
Prezentacja dostępna pod adresem: http://pawroman.github.io/pywaw-pandas/
Wpisy na blogu: http://piesnakod.pl/tag/pandas/
Kod prezentacji: https://github.com/pawroman/pywaw-pandas
Wykonano w IPython notebook http://ipython.org/notebook