A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index.
# Create series
import pandas as pd
s = pd.Series([4, 7, -5, 3])
s
0 4 1 7 2 -5 3 3 dtype: int64
# Create series with index
obj2 = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])
obj2
#Change value by index
obj2['b'] = 10
obj2
# Filter with boolean array
obj2[obj2 > 1 ]
# scalar multiplication
obj2 * 2
#apply math functions
import numpy as np
np.exp(obj2)
# Check if contains index
"b" in obj2
#Create series from dictionary
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
obj3 = pd.Series(sdata)
obj3
#Series to dictionary
obj3.to_dict()
A DataFrame represents a rectangular table of data composed of series objects share the same index
# Create from dictionary
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
"year": [2000, 2001, 2002, 2001, 2002, 2003],
"pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
df = pd.DataFrame(data)
# Show all data
df
# Read data from csv file
df = pd.read_csv('galton.csv')
df
family | father | mother | midparentHeight | children | childNum | gender | childHeight | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 78.5 | 67.0 | 75.43 | 4 | 1 | male | 73.2 |
1 | 1 | 78.5 | 67.0 | 75.43 | 4 | 2 | female | 69.2 |
2 | 1 | 78.5 | 67.0 | 75.43 | 4 | 3 | female | 69.0 |
3 | 1 | 78.5 | 67.0 | 75.43 | 4 | 4 | female | 69.0 |
4 | 2 | 75.5 | 66.5 | 73.66 | 4 | 1 | male | 73.5 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
929 | 203 | 62.0 | 66.0 | 66.64 | 3 | 1 | male | 64.0 |
930 | 203 | 62.0 | 66.0 | 66.64 | 3 | 2 | female | 62.0 |
931 | 203 | 62.0 | 66.0 | 66.64 | 3 | 3 | female | 61.0 |
932 | 204 | 62.5 | 63.0 | 65.27 | 2 | 1 | male | 66.5 |
933 | 204 | 62.5 | 63.0 | 65.27 | 2 | 2 | female | 57.0 |
934 rows × 8 columns
# Dataframe metadata
df.info()
# Show the first 3 rows
df.head(3)
# Show the last 3 rows
df.tail(3)
# Show columns
df.columns
# Select column by name
df['childHeight']
# Select column by name, option 2
df.childHeight
# Create new column with constant value
df['family_size'] = 'small'
df
# Convert to Numpy array
df.to_numpy()
# Transpose dataframe
df.T
# Select one column
df['mother']
# Select columns
df[['family', 'children']]
# Select rows by index
df.loc[0]
# Select by sequence index
df.iloc[0]
# Select rows by range
df.loc[1:3]
# Select rows and columns
df.loc[1:3 ,['childHeight', 'gender']]
# Filter criteria
df[df['children'] > 3]
# apply funtion
#TODO: find family_size
df['family_size'] = df.apply(??, axis=1)
df
# common array statistics: min, max, mean, sum,...
df.max()
# map function
def my_format(x):
return f"{x:.1f}"
df['midparentHeight'] = df[' '].map(my_format)
df
#applymap function
#df['midparentHeight'] = df['midparentHeight'].astype(float)
df[['father', 'mother', 'midparentHeight']].applymap(my_format)
# Sort by columns
df.sort_values(["family", "childHeight"])
family | father | mother | midparentHeight | children | childNum | gender | childHeight | |
---|---|---|---|---|---|---|---|---|
2 | 1 | 78.5 | 67.0 | 75.43 | 4 | 3 | female | 69.0 |
3 | 1 | 78.5 | 67.0 | 75.43 | 4 | 4 | female | 69.0 |
1 | 1 | 78.5 | 67.0 | 75.43 | 4 | 2 | female | 69.2 |
0 | 1 | 78.5 | 67.0 | 75.43 | 4 | 1 | male | 73.2 |
32 | 10 | 74.0 | 65.5 | 72.37 | 1 | 1 | female | 65.5 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
427 | 99 | 69.0 | 66.0 | 70.14 | 8 | 5 | female | 65.5 |
426 | 99 | 69.0 | 66.0 | 70.14 | 8 | 4 | male | 71.5 |
425 | 99 | 69.0 | 66.0 | 70.14 | 8 | 3 | male | 71.7 |
424 | 99 | 69.0 | 66.0 | 70.14 | 8 | 2 | male | 72.0 |
423 | 99 | 69.0 | 66.0 | 70.14 | 8 | 1 | male | 73.0 |
934 rows × 8 columns
# Sort by columns, descending
df.sort_values(["family", "childHeight"], ascending = False)
family | father | mother | midparentHeight | children | childNum | gender | childHeight | |
---|---|---|---|---|---|---|---|---|
423 | 99 | 69.0 | 66.0 | 70.14 | 8 | 1 | male | 73.0 |
424 | 99 | 69.0 | 66.0 | 70.14 | 8 | 2 | male | 72.0 |
425 | 99 | 69.0 | 66.0 | 70.14 | 8 | 3 | male | 71.7 |
426 | 99 | 69.0 | 66.0 | 70.14 | 8 | 4 | male | 71.5 |
427 | 99 | 69.0 | 66.0 | 70.14 | 8 | 5 | female | 65.5 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
32 | 10 | 74.0 | 65.5 | 72.37 | 1 | 1 | female | 65.5 |
0 | 1 | 78.5 | 67.0 | 75.43 | 4 | 1 | male | 73.2 |
1 | 1 | 78.5 | 67.0 | 75.43 | 4 | 2 | female | 69.2 |
2 | 1 | 78.5 | 67.0 | 75.43 | 4 | 3 | female | 69.0 |
3 | 1 | 78.5 | 67.0 | 75.43 | 4 | 4 | female | 69.0 |
934 rows × 8 columns
# Create dataframe with dupolicate index values
import numpy as np
obj = pd.DataFrame(np.arange(5), index=["a", "a", "b", "b", "c"], columns=['col_1'])
obj
col_1 | |
---|---|
a | 0 |
a | 1 |
b | 2 |
b | 3 |
c | 4 |
# Check if index is unique
obj.index.is_unique
False
# Access duplicate index
obj.loc['b']
col_1 | |
---|---|
b | 2 |
b | 3 |
# Create dataframe with some missing values
obj = pd.DataFrame([[1.4, np.nan],
[7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=["a", "b", "c", "d"],
columns=["one", "two"])
obj
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 7.10 | -4.5 |
c | NaN | NaN |
d | 0.75 | -1.3 |
# Check missing values
obj.isna().sum()
one 1 two 2 dtype: int64
obj.isna().sum()
# sum
df.sum(mn )
family 1111222233444445555556777777888910111111111111... father 64630.1 mother 59859.4 midparentHeight 64639.126 children 5764 childNum 3349 gender malefemalefemalefemalemalemalefemalefemalemale... childHeight 62340.7 dtype: object
# mean
df.mean()
/var/folders/h2/b7pvzcsn4ylbpbhjmh2sl5m40000gn/T/ipykernel_4281/3239134474.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. df.mean()
father 69.197109 mother 64.089293 midparentHeight 69.206773 children 6.171306 childNum 3.585653 childHeight 66.745931 dtype: float64
# index value where the maximum values are attained
df.loc[df["childHeight"]. ()]
family 72 father 70.0 mother 65.0 midparentHeight 70.1 children 7 childNum 1 gender male childHeight 79.0 Name: 292, dtype: object
# Show summary statistics
df.describe()
father | mother | midparentHeight | children | childNum | childHeight | |
---|---|---|---|---|---|---|
count | 934.000000 | 934.000000 | 934.000000 | 934.000000 | 934.000000 | 934.000000 |
mean | 69.197109 | 64.089293 | 69.206773 | 6.171306 | 3.585653 | 66.745931 |
std | 2.476479 | 2.290886 | 1.802370 | 2.729025 | 2.361410 | 3.579251 |
min | 62.000000 | 58.000000 | 64.400000 | 1.000000 | 1.000000 | 56.000000 |
25% | 68.000000 | 63.000000 | 68.140000 | 4.000000 | 2.000000 | 64.000000 |
50% | 69.000000 | 64.000000 | 69.248000 | 6.000000 | 3.000000 | 66.500000 |
75% | 71.000000 | 65.875000 | 70.140000 | 8.000000 | 5.000000 | 69.700000 |
max | 78.500000 | 70.500000 | 75.430000 | 15.000000 | 15.000000 | 79.000000 |
Correlation summarizes the relationship between two variables
df.plot.scatter(x='father', y='childHeight');
#df.plot.scatter(x='mother', y='childHeight');
df.plot.scatter(x='midparentHeight', y='childHeight');
df.corr()
/var/folders/h2/b7pvzcsn4ylbpbhjmh2sl5m40000gn/T/ipykernel_4281/1134722465.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. df.corr()
father | mother | midparentHeight | children | childNum | childHeight | |
---|---|---|---|---|---|---|
father | 1.000000 | 0.060366 | 0.728439 | -0.151333 | -0.087446 | 0.266039 |
mother | 0.060366 | 1.000000 | 0.727834 | -0.033582 | -0.019405 | 0.201322 |
midparentHeight | 0.728439 | 0.727834 | 1.000000 | -0.127016 | -0.073395 | 0.320950 |
children | -0.151333 | -0.033582 | -0.127016 | 1.000000 | 0.577838 | -0.126720 |
childNum | -0.087446 | -0.019405 | -0.073395 | 0.577838 | 1.000000 | -0.625026 |
childHeight | 0.266039 | 0.201322 | 0.320950 | -0.126720 | -0.625026 | 1.000000 |
# Get unique values
df['children'].unique()
array([ 4, 2, 5, 6, 1, 3, 8, 9, 7, 11, 10, 15])
# value counts
df['children'].value_counts()
5 140 8 136 4 124 7 119 6 108 3 66 9 63 2 46 11 44 10 40 1 33 15 15 Name: children, dtype: int64
Data loading functions has a long list of optional arguments
for example: pandas.read_csv has around 50 argument
type inference: That means you don’t necessarily have to specify which columns are numeric, integer, Boolean, or string
# Read CSV file
df = pd.read_csv("galton.csv")
df
family | father | mother | midparentHeight | children | childNum | gender | childHeight | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 78.5 | 67.0 | 75.43 | 4 | 1 | male | 73.2 |
1 | 1 | 78.5 | 67.0 | 75.43 | 4 | 2 | female | 69.2 |
2 | 1 | 78.5 | 67.0 | 75.43 | 4 | 3 | female | 69.0 |
3 | 1 | 78.5 | 67.0 | 75.43 | 4 | 4 | female | 69.0 |
4 | 2 | 75.5 | 66.5 | 73.66 | 4 | 1 | male | 73.5 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
929 | 203 | 62.0 | 66.0 | 66.64 | 3 | 1 | male | 64.0 |
930 | 203 | 62.0 | 66.0 | 66.64 | 3 | 2 | female | 62.0 |
931 | 203 | 62.0 | 66.0 | 66.64 | 3 | 3 | female | 61.0 |
932 | 204 | 62.5 | 63.0 | 65.27 | 2 | 1 | male | 66.5 |
933 | 204 | 62.5 | 63.0 | 65.27 | 2 | 2 | female | 57.0 |
934 rows × 8 columns
# Define index
df2 = pd.read_csv("galton.csv", index_col = 'family')
df2.head()
father | mother | midparentHeight | children | childNum | gender | childHeight | |
---|---|---|---|---|---|---|---|
family | |||||||
1 | 78.5 | 67.0 | 75.43 | 4 | 1 | male | 73.2 |
1 | 78.5 | 67.0 | 75.43 | 4 | 2 | female | 69.2 |
1 | 78.5 | 67.0 | 75.43 | 4 | 3 | female | 69.0 |
1 | 78.5 | 67.0 | 75.43 | 4 | 4 | female | 69.0 |
2 | 75.5 | 66.5 | 73.66 | 4 | 1 | male | 73.5 |
df2.iloc[0]
df2.loc[0]
# Define hierarchical index
df_3 = pd.read_csv("galton.csv", index_col = ['family', 'childNum'])
df_3
father | mother | midparentHeight | children | gender | childHeight | ||
---|---|---|---|---|---|---|---|
family | childNum | ||||||
1 | 1 | 78.5 | 67.0 | 75.43 | 4 | male | 73.2 |
2 | 78.5 | 67.0 | 75.43 | 4 | female | 69.2 | |
3 | 78.5 | 67.0 | 75.43 | 4 | female | 69.0 | |
4 | 78.5 | 67.0 | 75.43 | 4 | female | 69.0 | |
2 | 1 | 75.5 | 66.5 | 73.66 | 4 | male | 73.5 |
... | ... | ... | ... | ... | ... | ... | ... |
203 | 1 | 62.0 | 66.0 | 66.64 | 3 | male | 64.0 |
2 | 62.0 | 66.0 | 66.64 | 3 | female | 62.0 | |
3 | 62.0 | 66.0 | 66.64 | 3 | female | 61.0 | |
204 | 1 | 62.5 | 63.0 | 65.27 | 2 | male | 66.5 |
2 | 62.5 | 63.0 | 65.27 | 2 | female | 57.0 |
934 rows × 6 columns
# Write csv file
df.to_csv('new path')
# Save(Serialize) df as binary file
df.to_pickle("df.pickle")
# Read dataframe from pickle file
pd.read_pickle('df.pickle')
family | father | mother | midparentHeight | children | childNum | gender | childHeight | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 78.5 | 67.0 | 75.43 | 4 | 1 | male | 73.2 |
1 | 1 | 78.5 | 67.0 | 75.43 | 4 | 2 | female | 69.2 |
2 | 1 | 78.5 | 67.0 | 75.43 | 4 | 3 | female | 69.0 |
3 | 1 | 78.5 | 67.0 | 75.43 | 4 | 4 | female | 69.0 |
4 | 2 | 75.5 | 66.5 | 73.66 | 4 | 1 | male | 73.5 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
929 | 203 | 62.0 | 66.0 | 66.64 | 3 | 1 | male | 64.0 |
930 | 203 | 62.0 | 66.0 | 66.64 | 3 | 2 | female | 62.0 |
931 | 203 | 62.0 | 66.0 | 66.64 | 3 | 3 | female | 61.0 |
932 | 204 | 62.5 | 63.0 | 65.27 | 2 | 1 | male | 66.5 |
933 | 204 | 62.5 | 63.0 | 65.27 | 2 | 2 | female | 57.0 |
934 rows × 8 columns
pd.read_excel("grocery.xlsx", sheet_name = 'prices')
Item | Mawares | Abo Ahmad | Bravo | Karaz | Max mar | Shini | Baghdad | Gardens | |
---|---|---|---|---|---|---|---|---|---|
0 | Lemon | 2.3 | 2.0 | 3.0 | 3.0 | 3.5 | 2.7 | 3.1 | 4.0 |
1 | Tomato | 2.0 | 2.0 | 2.2 | 2.5 | 3.0 | 2.0 | 3.0 | 2.4 |
2 | Potato | 3.3 | 3.3 | 3.3 | 4.0 | 5.0 | 3.0 | 3.5 | 3.5 |
3 | Watermelon | 1.0 | 1.5 | 1.5 | 1.5 | 3.0 | 2.0 | 3.0 | 3.0 |
4 | Zucchini | 12.0 | 8.0 | 8.0 | 8.0 | 12.0 | 7.0 | 8.0 | 10.0 |
5 | Eggplant | 3.0 | 2.5 | 3.0 | 3.0 | 5.0 | 3.0 | 5.0 | 5.0 |
6 | Apple | 7.0 | 7.0 | 8.0 | 7.0 | 13.0 | 8.0 | 10.0 | 12.0 |
7 | Panana | 3.3 | 3.3 | 3.0 | 4.0 | 4.0 | 3.5 | 5.0 | 5.0 |
import requests
url = "https://dummyjson.com/products/2"
resp = requests.get(url)
data = resp.json()
data
{'id': 2, 'title': 'iPhone X', 'description': 'SIM-Free, Model A19211 6.5-inch Super Retina HD display with OLED technology A12 Bionic chip with ...', 'price': 899, 'discountPercentage': 17.94, 'rating': 4.44, 'stock': 34, 'brand': 'Apple', 'category': 'smartphones', 'thumbnail': 'https://cdn.dummyjson.com/product-images/2/thumbnail.jpg', 'images': ['https://cdn.dummyjson.com/product-images/2/1.jpg', 'https://cdn.dummyjson.com/product-images/2/2.jpg', 'https://cdn.dummyjson.com/product-images/2/3.jpg', 'https://cdn.dummyjson.com/product-images/2/thumbnail.jpg']}
pd.DataFrame(data)
id | title | description | price | discountPercentage | rating | stock | brand | category | thumbnail | images | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | iPhone X | SIM-Free, Model A19211 6.5-inch Super Retina H... | 899 | 17.94 | 4.44 | 34 | Apple | smartphones | https://cdn.dummyjson.com/product-images/2/thu... | https://cdn.dummyjson.com/product-images/2/1.jpg |
1 | 2 | iPhone X | SIM-Free, Model A19211 6.5-inch Super Retina H... | 899 | 17.94 | 4.44 | 34 | Apple | smartphones | https://cdn.dummyjson.com/product-images/2/thu... | https://cdn.dummyjson.com/product-images/2/2.jpg |
2 | 2 | iPhone X | SIM-Free, Model A19211 6.5-inch Super Retina H... | 899 | 17.94 | 4.44 | 34 | Apple | smartphones | https://cdn.dummyjson.com/product-images/2/thu... | https://cdn.dummyjson.com/product-images/2/3.jpg |
3 | 2 | iPhone X | SIM-Free, Model A19211 6.5-inch Super Retina H... | 899 | 17.94 | 4.44 | 34 | Apple | smartphones | https://cdn.dummyjson.com/product-images/2/thu... | https://cdn.dummyjson.com/product-images/2/thu... |
Navigate Kaggle and select a data set that contains at least 5 features and have both categorical and numerical features.