Manipulate Data Frame in Python

We can use pandas1 3 to manipulate data frame In this post, we create students data frame for demonstration

import pandas as pd
students = pd.DataFrame({
  "name": ["Peter", "Lily", "John", "Alice"],
  "sex": ["male", "female", "male", "female"],
  "birth_year" : [1995, 1998, 1997, 1996]
})

#    birth_year  name     sex
# 1        1998  Lily  female
# 2        1997  John    male

Subsetting

By columns

# select column by name
students['name']

# 0    Peter
# 1     Lily
# 2     John
# 3    Alice

# Last column
students.iloc[:, -1]

# 0      male
# 1    female
# 2      male
# 3    female

# All column except last
students.iloc[:, :-1]
 
#    birth_year   name
# 0        1995  Peter
# 1        1998   Lily
# 2        1997   John
# 3        1996  Alice

By rows

students.head(2)

#    birth_year   name     sex
# 0        1995  Peter    male
# 1        1998   Lily  female

students.tail(2)

#    birth_year   name     sex
# 2        1997   John    male
# 3        1996  Alice  female

students.loc[1:2]

#    birth_year  name     sex
# 1        1998  Lily  female
# 2        1997  John    male

# split train test
from sklearn import cross_validation
cross_validation.train_test_split(students, test_size=0.2)
# [array([[1995L, 'Peter', 'male'],
#  [1998L, 'Lily', 'female'],
#  [1996L, 'Alice', 'female']], dtype=object),
#  array([[1997L, 'John', 'male']], dtype=object)]

Manipulate columns

Add columns 2

students['nationality'] = 'US'
students

#    birth_year   name     sex nationality
# 0        1995  Peter    male          US
# 1        1998   Lily  female          US
# 2        1997   John    male          US
# 3        1996  Alice  female          US

Rename columns 4

students.rename(columns={'birth_year':'year'})

#    year   name     sex
# 0  1995  Peter    male
# 1  1998   Lily  female
# 2  1997   John    male
# 3  1996  Alice  female

Merging 5

We consider two data frames dogs and owners

dogs = pd.DataFrame({
  'name' : ['Max', 'Lucy', 'Charlie'],
  'age' : [1, 2, 1.5],
  'owner': ['Smith', 'John', 'Mary']
})

#    age     name    owner
# 0  1.0      Max    Smith
# 1  2.0     Lucy     John
# 2  1.5  Charlie     Mary

owners = pd.DataFrame({
  'name': ['Smith', 'John', 'Jennifer'],
  'country': ['USA', 'English', 'USA']
})

#    country      name
# 0      USA     Smith
# 1  English      John
# 2      USA  Jennifer

Outer Join

pd.merge(dogs, owners,
         left_on='owner', right_on='name',
         how='outer')
#    age   name_x    owner  country    name_y
# 0  1.0      Max    Smith      USA     Smith
# 1  2.0     Lucy     John  English      John
# 2  1.5  Charlie     Mary      NaN       NaN
# 3  NaN      NaN      NaN      USA  Jennifer

Inner Join

pd.merge(dogs, owners,
         left_on='owner', right_on='name',
         how='inner')

#    age name_x    owner  country   name_y
# 0    1    Max    Smith      USA    Smith
# 1    2   Lucy  Johnson  English     John

Left Join

pd.merge(dogs, owners,
         left_on='owner', right_on='name',
         how='left')

#    age   name_x    owner  country   name_y
# 0  1.0      Max    Smith      USA    Smith
# 1  2.0     Lucy     John  English     John
# 2  1.5  Charlie     Mary      NaN      NaN

Right Join

pd.merge(dogs, owners,
         left_on='owner', right_on='name',
         how='right')

#    age name_x    owner  country    name_y
# 0    1    Max    Smith      USA     Smith
# 1    2   Lucy     John  English      John
# 2  NaN    NaN      NaN      USA  Jennifer

Filtering

students = students[students.birth_year > 1996]

#    birth_year  name     sex
# 1        1998  Lily  female
# 2        1997  John    male

Sorting

students.sort('birth_year')

#    birth_year   name     sex
# 0        1995  Peter    male
# 3        1996  Alice  female
# 2        1997   John    male
# 1        1998   Lily  female

Group, Apply, Combine 6

dogs = pd.DataFrame({
  'name' : ['Max', 'Lucy', 'Charlie', 'Rocky'],
  'age' : [1, 2, 1.5, 2],
  'owner': ['Smith', 'John', 'Mary', 'Mary']
})

#    age     name  owner
# 0  1.0      Max  Smith
# 1  2.0     Lucy   John
# 2  1.5  Charlie   Mary
# 3  2.0    Rocky   Mary

# create owners data frame
# concat names of their dogs
owners = dogs.groupby('owner')['name']
             .apply(lambda x: "%s" % ' '.join(x))
             .to_frame().reset_index()

#    owner           name
# 0   John           Lucy
# 1   Mary  Charlie Rocky
# 2  Smith            Max
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s