Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions pandas_DataFrame.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
A DataFrame is a 2-D labeled data structure whose columns can hold different types.
"""

import pandas as pd
import numpy as np
import random

# DataFrame is one of the two core pandas structures: a 2-D labeled table.
# Build one from a dict that maps each column name to its values.
df1 = pd.DataFrame(dict(id=[100, 101, 102], color=['red', 'blue', 'green']))
print('--------------df1-------------')
print(df1)

# `columns` pins the column order explicitly and `index` supplies the
# row labels instead of the default 0, 1, 2, ...
df2 = pd.DataFrame(
    dict(id=[100, 101, 102], color=['red', 'blue', 'green']),
    columns=['id', 'color'],
    index=list('abc'),
)
print('--------------df2-------------')
print(df2)

# Build from a list of rows (one inner list per row). Without explicit
# labels both the index and the column names default to 0, 1, 2, ...
_rows = [
    [100, 'red'],
    [101, 'blue'],
    [102, 'green'],
]
df3 = pd.DataFrame(_rows)
print('--------------df3-------------')
print(df3)

# Same rows, this time with explicit column names and row labels.
df4 = pd.DataFrame(_rows, columns=['id', 'color'], index=list('abc'))
print('--------------df4-------------')
print(df4)

# Build from a NumPy ndarray: a 4x2 array of uniform random floats in [0, 1).
arr = np.random.rand(4, 2)
print('--------------arr-------------')
print(arr)

# Each array column becomes a DataFrame column; the rows keep the default
# integer index.
df5 = pd.DataFrame(data=arr, columns=['one', 'two'])
print('--------------df5-------------')
print(df5)

# Ten consecutive ids (100..109) paired with random integer scores drawn
# from [60, 100] (the upper bound of randint is exclusive).
df6 = pd.DataFrame({'Id': np.arange(100, 110, 1),
                    'scores': np.random.randint(60, 101, 10)})
print('--------------df6-------------')
print(df6)
# Promote the 'Id' column to the row index. set_index returns a new frame;
# df6 itself is left unchanged.
df7 = df6.set_index('Id')
print('--------------df7-------------')
print(df7)
# Drop the index name ('Id') so that no extra header line is printed
# above the index.
df8 = df7.rename_axis(None)
print('--------------df8-------------')
print(df8)

# Build from a NumPy structured (record) array.
# Field dtypes: i4 -> int32, f8 -> float64, S10 -> 10-byte string.
# 'S10' replaces the legacy 'a10' spelling, an alias that NumPy 2.0 removed.
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f8'), ('C', 'S10')])
print('--------------data-------------')
print(data)
# Fill both records in place; each tuple supplies the fields (A, B, C).
data[:] = [(1, 2, 'Hello'), (3, 4., 'World')]
print('--------------revised data-------------')
print(data)
# Use field 'C' as the row index; the remaining fields become columns.
df9 = pd.DataFrame.from_records(data, index='C')
print('--------------df9-------------')
print(df9)

# Column selection, addition and deletion.
print('------DataFrame column Selection-------')
print(df9['A'])
# Attribute access only works when the column name is a valid Python
# identifier (e.g. it contains no whitespace).
print(df9.B)
df9['D'] = [2, 3.5]
# New columns can be derived element-wise from existing ones.
df9['E'] = df9.A * df9.B
print('------DataFrame column addition-------')
print(df9)
# Two ways to remove a column: `del` discards it, while `pop` removes it
# and returns it as a Series (the return value is unused here).
del df9['B']
df9.pop('A')
print('------DataFrame column deletion-------')
print(df9)
64 changes: 64 additions & 0 deletions pandas_Series.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
pandas is one of the most popular Python libraries for manipulating and analyzing data.
"""

import pandas as pd
import numpy as np

# Series is one of the two core pandas structures: a one-dimensional
# labeled array that can hold str, int, float, arbitrary Python objects, etc.
# Method 1: create from a list (any array-like, such as an ndarray, works
# the same way). When an index is given, its length must equal the length
# of the data.
x = list(range(100, 105))
sr1 = pd.Series(x, name="ID",
                index=['a', 'b', 'c', 'd', 'e'])
print('-----sr1-----')
print(sr1)

# Method 2: create from a dict -- the keys become the index labels and the
# values become the data.
d = dict(a=0, b=1, c=2, d=3.0)
sr2 = pd.Series(data=d)
print('-----sr2-----')
print(sr2)

# A single non-numeric value ('c') makes the whole Series fall back to the
# generic ``object`` dtype.
d = {'a': 0, 'b': 1, 'c': 'c', 'd': 3.0}
sr3 = pd.Series(d)
print('-----sr3-----')
print(sr3)

# NaN is the standard missing-data marker in pandas: label 'e' is not a key
# of the dict, so its value is NaN. The result follows the order of the
# requested index, not the order of the dict.
sr4 = pd.Series(d, index=['a', 'b', 'c', 'e', 'd'])
print('-----sr4-----')
print(sr4)

# From a scalar value: an index must be provided so pandas knows the length.
sr5 = pd.Series(2, index=['a', 'b', 'c'], name='scalar')
print('-----sr5-----')
print(sr5)

# A Series is mutable and can be manipulated like both a dict and an ndarray.
# sr3 still has object dtype at this point, so NumPy ufuncs such as
# sqrt / exp cannot be applied until it is converted to a numeric dtype.
sr3['c'] = 2
print('-----changed sr3-----')
print(sr3)
# Positional access goes through .iloc: plain sr3[1] on a string-labelled
# Series is deprecated (pandas 2.1) and is treated as a label lookup in
# later versions.
print(sr3.iloc[1])
print(sr3 + sr3)
print(sr3 * 3)
# np.sqrt(sr3) would raise an error here because of the object dtype.
print(sr3[sr3 >= 2])
print(sr3)
# Assigning to a label that does not exist yet appends a new item.
sr3['e'] = 4.0
# Rebuild as a float Series with a name; sqrt works from here on.
sr3 = pd.Series(sr3, dtype=float, name='example')
print('---sr3 add item and name, change type---')
print(sr3)
print(np.sqrt(sr3))
print(sr3.name)
# rename() with a scalar (pandas >= 0.18) returns a renamed copy;
# sr3 keeps its own name.
sr6 = sr3.rename("changed")
print('-----sr6-----')
print(sr6)
print(sr6.name)
print(sr3)
print(pd.__version__)