Using Pandas in Python
Hello all,
This is the fourth article in the Python for Data Science series. If you are new to this series, we recommend reading our previous articles first.
To install pandas in your python environment, please use the below command
pip install pandas
Please refer to the videos below for a detailed explanation of Pandas.
After you have installed Pandas, please refer to the following notebook to understand how to use its functionality.
import pandas as pd
import os
Creating a DataFrame¶
# Three lists of sample values; each list becomes one row of the frame.
data1 = [1,2,3]
data2 = [4,5,6]
data3 = [7,8,9]
# Build a frame with explicit column labels and row labels.
pd.DataFrame([data1,data2,data3],columns = ['a','b','c'],index = ['row1','row2','row3'])
# With no labels given, pandas numbers both rows and columns 0, 1, 2.
pd.DataFrame([data1,data2,data3])
Reading CSV through Pandas¶
# Absolute path to the sample CSV on the author's machine; point this at your own copy.
csv_file_path = r"E:\openknowledgeshare.blogspot.com\Python\Data\SBIN.NS.csv"
df = pd.read_csv(csv_file_path)
df
# (rows, columns) tuple, then the column labels.
df.shape
df.columns
# Rename the 'Date' column to 'NewDate'; inplace=True modifies df itself.
df.rename(columns={'Date':'NewDate'},inplace=True)
df
# head()/tail() return the first/last 5 rows by default; pass a number for a different count.
df.head()
df.head(10)
df.tail()
# Summary statistics for the frame.
df.describe()
df.head()
# set_index returns a new frame here (no inplace), so df keeps its original index.
df.set_index('NewDate').head()
# Selecting a single column with [] yields a Series rather than a DataFrame ...
df["High"].head()
type(df)
type(df["High"])
# ... and to_frame() converts that Series into a one-column DataFrame.
df["High"].to_frame().head()
# iloc selects by position: rows 10 to 19, then a subset of columns.
df.iloc[10:20]
df.iloc[10:20][["Close","High"]]
# A list of positions picks arbitrary rows.
df.iloc[[1,3,10]][["Close","High"]]
# .values exposes the data of the first five rows as a NumPy array.
a = df.head().values
a
a.shape
type(a)
df.head()
Manipulating Cell Values of Dataframe¶
# Build a small labelled frame to demonstrate reading and writing cells.
data1 = [1, 2, 3]
data2 = [4, 5, 6]
data3 = [7, 8, 9]
df1 = pd.DataFrame([data1, data2, data3], columns=['a', 'b', 'c'])
df1
# Read a single cell: row position 1, column 'c'.
df1.iloc[1]['c']
# Write a single cell in ONE indexing step. The chained form
# df1.iloc[1]['c'] = 10 assigns into an intermediate Series, which may be a
# copy, so the change is not guaranteed to reach df1.
df1.iloc[1, df1.columns.get_loc('c')] = 10
df1
# A whole row is returned as a Series ...
df1.iloc[2]
type(df1.iloc[2])
# ... and can be overwritten with a list of matching length.
df1.iloc[2] = [10, 20, 30]
df1
# Several rows can be replaced at once with a list of lists.
df1.iloc[[1, 2]] = [[100, 200, 300], [1030, 2010, 3000]]
df1
# iloc can only address existing positions; it cannot enlarge the frame.
# The failure is caught and reported so the rest of the script keeps running.
try:
    df1.iloc[3] = [10, 4, 1]
except IndexError as error:
    print("Cannot add a row with iloc: {}".format(error))
Appending Row to DataFrame¶
# A plain Python list grows in place with append().
a = [1, 2, 3]
a.append(5)
a
df1
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# The new row is wrapped in its own frame, whose columns are labelled 0, 1, 2,
# so they do not line up with df1's 'a', 'b', 'c'. The result is not assigned,
# so df1 itself is left untouched (shown on the next line).
pd.concat([df1, pd.DataFrame([[10, 8, 4]])])
df1
Case 1: DataFrame without Column Names¶
# Frame without explicit column names: columns are labelled 0, 1, 2.
data1 = [1, 2, 3]
data2 = [4, 5, 6]
data3 = [7, 8, 9]
df2 = pd.DataFrame([data1, data2, data3])
df2
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# The new row's frame also has columns 0, 1, 2, so it lines up with df2.
df2 = pd.concat([df2, pd.DataFrame([[10, 20, 30]])])
df2
# The appended row kept its own index label 0. reset_index() returns a new
# frame with the old index moved into a column ...
df2.reset_index()
# ... while drop=True discards the old index and inplace=True renumbers df2 itself.
df2.reset_index(drop=True, inplace=True)
df2
Case 2: DataFrame with Column Names¶
df1
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# A bare list of values gets columns 0, 1, 2, which do not match 'a', 'b', 'c'.
pd.concat([df1, pd.DataFrame([[10, 20, 30]])], ignore_index=True)
# A dict keyed by column name lines the values up with the existing columns.
pd.concat([df1, pd.DataFrame([{'a': 10, 'b': 20, 'c': 30}])], ignore_index=True)
columnnames_list = list(df1.columns)
columnnames_list
data = [1000, 2000, 3000]
# Pair each column name with the value at the same position.
data_dict = dict(zip(columnnames_list, data))
data_dict
# Assign the result back: concat returns a new frame and leaves df1 unchanged.
df1 = pd.concat([df1, pd.DataFrame([data_dict])], ignore_index=True)
df1
df1
Sampling DataFrame¶
import numpy as np
a = [1,2,3,5,6,7]
# Draw 3 values at random from the list.
np.random.choice(a,3)
# Pick 10 rows of the frame at random; the rows differ on every run.
df.sample(10)
Merge, Join and Concatenate DataFrames¶
df1
df2
# Merge on the row index of both frames; 'outer' keeps index labels found in either frame.
df3 = pd.merge(df1,df2,left_index=True,right_index=True,how='outer')
df3
# Rebuild df2 with three rows before repeating the index-based merges.
data1 = [1,2,3]
data2 = [4,5,6]
data3 = [7,8,9]
df2 = pd.DataFrame([data1,data2,data3])
df2
df3 = pd.merge(df1,df2,left_index=True,right_index=True,how='outer')
df3
# 'inner' keeps only index labels present in both frames.
df3 = pd.merge(df1,df2,left_index=True,right_index=True,how='inner')
df3
# 'left' keeps every index label of df1.
df3 = pd.merge(df1,df2,left_index=True,right_index=True,how='left')
df3
# 'right' keeps every index label of df2.
df3 = pd.merge(df1,df2,left_index=True,right_index=True,how='right')
df3
# Two frames that share a 'key' column; K2 appears only in df1 and K4 only in df2.
df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K4', 'K3'],
'C': ['C0', 'C1', 'C4', 'C3'],
'D': ['D0', 'D1', 'D4', 'D3']})
df1
df2
# Index-based merge again: rows are paired by position label, not by 'key'.
df3 = pd.merge(df1,df2,left_index=True,right_index=True,how='outer')
df3
# Merging on the 'key' column pairs rows by key value instead.
df3 = pd.merge(df1,df2,on ='key',how='outer')
df3
df3 = pd.merge(df1,df2,on ='key',how='inner')
df3
df3 = pd.merge(df1,df2,on ='key',how='left')
df3
df3 = pd.merge(df1,df2,on ='key',how='right')
df3
Concat¶
# Three Series of different lengths; only s3 carries a name.
s3 = pd.Series([0, 1, 2, 3], name='foo')
s4 = pd.Series([4, 5, 6, 7, 12])
s5 = pd.Series([11, 10, 9, 8, 13, 14])
s3
s4
s5
# axis=1 places the Series side by side as columns, aligned on the index.
pd.concat([s3, s4, s5], axis=1)
# axis=0 stacks them end to end into one longer Series.
pd.concat([s3, s4, s5], axis=0)
df1
df2
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames instead.
# Without ignore_index the original row labels are kept, so labels repeat.
pd.concat([df1, df2])
# ignore_index=True renumbers the rows; sort=False keeps column order as encountered.
df3 = pd.concat([df1, df2], ignore_index=True, sort=False)
df3
Changing Column Order¶
# Selecting with a list of column names returns the columns in the listed order.
df3[['A','B','C','D','key']]
Handling Missing Values¶
df3
# Independent copy of the first five rows of df.
df4 = df.head().copy()
df4
df3
df4
# Stacking frames whose columns differ leaves NaN wherever a frame lacks a column.
df5 = pd.concat([df3,df4],ignore_index=True, sort=False)
df5
# Boolean frame marking the missing cells.
df5.isna()
# fillna returns a new frame here; df5 itself is not modified by this line.
df5.fillna("EMPTY")
# inplace=True replaces every NaN in df5 itself with 0.
df5.replace(np.nan,0,inplace = True)
df5
Filtering DataFrame¶
df5['key']
# A boolean comparison on a column produces a mask; indexing with it keeps the matching rows.
df5[df5['key'] == 'K0']
# Combine conditions with &; each condition needs its own parentheses.
df5[(df5['key'] == 'K4') & (df5['Open'] == 0)]
# isin() tests membership in a list of accepted values.
keylist = ['K2','K3']
df5[(df5['key'].isin(keylist)) & (df5['Open'] == 0)]
df5[df5['Open'] < 100]
Renaming Columns¶
# Work on an independent copy of 10 randomly chosen rows.
temp = df.sample(10).copy()
temp
# Rename selected columns through a mapping of old name to new name.
temp.rename(columns = {'NewDate':'OneMoreNewDate','Adj Close':'ABC'},inplace = True)
temp
# Replace every column label at once; the list length must equal the number of columns.
# NOTE(review): assumes the CSV has exactly 7 columns - confirm against the data file.
temp.columns = ['A','B','CD','EF','GH','i',3939]
temp
Deleting Column¶
# axis=1 drops a column by name.
temp.drop("B", axis=1, inplace=True)
temp
# axis=0 drops rows by index label. temp is a random sample, so its labels
# differ on every run; take them from temp.index instead of hard-coding
# labels that only existed in one particular sample.
temp.drop(temp.index[0], axis=0, inplace=True)
temp
# Several rows can be dropped at once by passing a collection of labels.
temp.drop(temp.index[:2], axis=0, inplace=True)
temp
temp.drop("GH", axis=1, inplace=True)
temp
# Fresh keyed frames for a deterministic example.
df1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
df2 = pd.DataFrame({
    'key': ['K0', 'K1', 'K4', 'K3'],
    'C': ['C0', 'C1', 'C4', 'C3'],
    'D': ['D0', 'D1', 'D4', 'D3'],
})
df1
# Drop column 'A', then the row labelled 2.
df1.drop('A', axis=1, inplace=True)
df1
df1.drop(2, axis=0, inplace=True)
df1
Row-wise and Column-wise Functions¶
temp
# Inspect the current dtype of column 'CD' ...
temp['CD'].dtype
# ... then store it back converted to 32-bit integers.
temp['CD'] = temp['CD'].astype('int32')
temp['CD']
temp
def range_indicator(input_value):
    """Return True when ``input_value`` is strictly greater than 50, else False.

    The comparison result is passed through ``bool`` so the function always
    returns a plain Python boolean.
    """
    return bool(input_value > 50)
range_indicator(278)
temp['CD']
# apply() calls the function once per value of the column; the function can be
# passed directly, with no lambda wrapper needed.
temp['Range_Indicator'] = temp['CD'].apply(range_indicator)
temp
# A lambda is just an anonymous function object; this one forwards to range_indicator.
lambda x: range_indicator(x)
# The same per-item call written as an explicit loop ...
for each_item in [12, 4, 5, 321]:
    print(range_indicator(each_item))
# ... and with map(), which works on a list and on a column alike.
list(map(range_indicator, [12, 4, 5, 321]))
list(map(range_indicator, temp['CD']))
Pandas Visualization¶
# pandas plotting draws through matplotlib; plt is not referenced by name in the lines shown here.
import matplotlib.pyplot as plt
df.head()
df.tail()
# Default plot kind is a line: 'Open' on the y-axis against 'NewDate' on the x-axis.
df.plot(x='NewDate',y = 'Open')
# Scatter plot of 'Close' against 'Open'.
df.plot(x='Open',y = 'Close',kind='scatter')
# Histogram of the 'Open' column.
df['Open'].hist()
df.shape
Statistics¶
df.head()
df.info()
df.describe()
df["Open"].hist()
# 'NewDate' still holds strings at this point. From pandas 2.0 corr() no longer
# silently skips non-numeric columns, so restrict it to numeric ones explicitly.
df.corr(numeric_only=True)
df.head()
# Same reason: sum only the numeric columns, then add the column totals together.
df.head().sum(axis=0, numeric_only=True).sum()
# Mean computed by hand.
a = [1, 2, 3, 4, 5, 10, 20, 50]
sum(a) / len(a)
# Standard Deviation
import numpy as np
a = np.asarray(a)
a
np.mean(a)
np.std(a)
df.head()
df1
# apply() works column by column unless axis=1 is given. The lambda looks up
# columns by name on each row, so it needs axis=1.
df.apply(lambda x: x['Open'] + x['High'], axis=1)
# Column-wise apply: every value of the selected columns is doubled.
df[["Open", "High", "Low", "Close"]].apply(lambda x: 2 * x)
# Parse the date strings into real datetime values.
df["NewDate"] = pd.to_datetime(df["NewDate"], format='%Y-%m-%d')
df["NewDate"].dtype
df['NewDate'].iloc[0].day
df['NewDate'].iloc[0].date()
import datetime
# A timedelta is a duration that can be added to a timestamp.
datetime.timedelta(days=10, hours=10)
df['NewDate'].iloc[0] + datetime.timedelta(days=10, hours=10)
a = df['NewDate'].iloc[0] + datetime.timedelta(days=10, hours=10)
a
# strptime parses text and is not available on a Timestamp instance;
# strftime is the call that formats an existing timestamp as text.
a.strftime('%Y-%m-%d %H:%M:%S')
import datetime
# strptime: text -> datetime, using the given format.
d = datetime.datetime.strptime("01/27/2012", "%m/%d/%Y")
d
# strftime: datetime -> text.
b = datetime.datetime.strftime(d, "%Y-%m-%d")
b
type(b)
import time
time.ctime()
TimeStamp Folder Creation¶
import os

# Parent directory that receives one sub-folder per run.
main_folder_path = r'E:\openknowledgeshare.blogspot.com\Python\Outputs'
# Folder name built from the current date and time, down to the second.
timestamp_now = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
# Create <main_folder_path>/<timestamp_now>, including missing parent folders.
os.makedirs(
    os.path.join(main_folder_path, timestamp_now)
)
Time Taken¶
# time.clock() was removed in Python 3.8; perf_counter() is the high-resolution
# clock intended for measuring elapsed time.
start_time = time.perf_counter()
# Stand-in for the code being timed.
for i in range(0, 10):
    i
end_time = time.perf_counter()
print("Time taken for this code is {} seconds".format(end_time - start_time))
Stacking¶
df1
df2
# stack() moves the column labels into the row index, giving one long Series.
df2.stack()
# unstack() with no argument uses the default level. The empty-string level
# name passed originally matches no index level and is rejected by current pandas.
df2.unstack()
import pandas as pd
import numpy as np

# Two-level row index: every semester paired with every subject.
header = pd.MultiIndex.from_product(
    [
        ['Semester1', 'Semester2'],
        ['Maths', 'Science'],
    ]
)
# One row of marks per (semester, subject) pair, one column per student.
d = [
    [12, 45, 67, 56],
    [78, 89, 45, 67],
    [45, 67, 89, 90],
    [67, 44, 56, 55],
]
df = pd.DataFrame(
    d,
    columns=['Alisa', 'Bobby', 'Cathrine', 'Jack'],
    index=header,
)
df
# xs() takes a cross-section: all rows under one outer-level label ...
df.xs("Semester1")
df.xs("Semester2")
# ... or, with level=1, all rows carrying one inner-level label.
df.xs("Maths", level=1)
df
# unstack() moves the inner index level into the columns.
df.unstack()
Comments
Post a Comment