Module 26 - Pandas Basics
Pandas is the essential library for data manipulation and analysis in Python. Built on top of NumPy, it provides the DataFrame structure for working with labeled, tabular data.
1. Introduction to Pandas
Installation
pip install pandas
Core Data Structures
| Structure | Description | Analogy |
|---|---|---|
| Series | 1D labeled array | Single column |
| DataFrame | 2D labeled array | Excel spreadsheet |
import pandas as pd
# Series - 1D
series = pd.Series([10, 20, 30, 40])
print(series)
# DataFrame - 2D
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'London', 'Paris']
}
df = pd.DataFrame(data)
print(df)
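The "labeled" part matters: a Series can carry an explicit index instead of the default 0 to n-1 positions. A minimal sketch (the day labels are just illustrative):
import pandas as pd
# Series with an explicit label index instead of default integer positions
temps = pd.Series([21.5, 19.0, 23.2], index=['Mon', 'Tue', 'Wed'])
print(temps['Tue'])  # access by label -> 19.0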
2. Creating DataFrames
2.1 From Dictionary
import pandas as pd
# Simple dictionary
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 28],
    'Salary': [50000, 60000, 70000, 55000]
}
df = pd.DataFrame(data)
print(df)
2.2 From Lists
# List of lists
data = [
    ['Alice', 25, 50000],
    ['Bob', 30, 60000],
    ['Charlie', 35, 70000]
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Salary'])
print(df)
2.3 From NumPy Array
import numpy as np
import pandas as pd
arr = np.random.rand(4, 3)
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])
print(df)
2.4 Reading from Files
# CSV
df = pd.read_csv('data.csv')
# Excel
df = pd.read_excel('data.xlsx')  # requires an Excel engine such as openpyxl
# JSON
df = pd.read_json('data.json')
# SQL
import sqlite3
conn = sqlite3.connect('database.db')
df = pd.read_sql('SELECT * FROM users', conn)
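read_csv accepts many tuning parameters. A few commonly useful ones, sketched with a placeholder file name and an assumed 'Date' column:
import pandas as pd
df = pd.read_csv(
    'data.csv',
    na_values=['N/A', 'missing'],  # extra strings to treat as NaN
    parse_dates=['Date'],          # parse an assumed 'Date' column as datetime
    nrows=1000                     # read only the first 1000 rows
)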
3. DataFrame Basics
3.1 Viewing Data
import pandas as pd
df = pd.read_csv('data.csv')
# First/last rows
print(df.head()) # First 5 rows
print(df.head(10)) # First 10 rows
print(df.tail()) # Last 5 rows
# Basic info
print(df.shape) # (rows, columns)
print(df.columns) # Column names
print(df.dtypes) # Data types
df.info() # Comprehensive info (prints itself; no print() needed)
print(df.describe()) # Statistical summary
3.2 Selecting Data
# Select single column (returns Series)
ages = df['Age']
print(ages)
# Select multiple columns (returns DataFrame)
subset = df[['Name', 'Age']]
print(subset)
# Select rows by position
print(df.iloc[0]) # First row
print(df.iloc[0:3]) # First 3 rows
print(df.iloc[0, 2]) # Row 0, Column 2
# Select rows by label
print(df.loc[0]) # Row with index 0
print(df.loc[0:2, 'Name']) # Rows 0-2, 'Name' column
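One subtle difference worth remembering: .iloc slices exclude the end position (like Python lists), while .loc slices include the end label. A quick sketch:
import pandas as pd
df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35]})
print(df.iloc[0:2])  # positions 0 and 1 - end excluded
print(df.loc[0:2])   # labels 0, 1 AND 2 - end included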
4. Filtering Data
4.1 Boolean Indexing
import pandas as pd
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 28],
    'Salary': [50000, 60000, 70000, 55000]
})
# Single condition
high_earners = df[df['Salary'] > 55000]
print(high_earners)
# Multiple conditions (AND)
result = df[(df['Age'] > 25) & (df['Salary'] > 55000)]
print(result)
# Multiple conditions (OR)
result = df[(df['Age'] < 26) | (df['Salary'] > 65000)]
print(result)
# String operations
df_names = df[df['Name'].str.startswith('A')]
print(df_names)
4.2 Using .query()
# More readable filtering
result = df.query('Age > 25 and Salary > 55000')
print(result)
result = df.query('Name == "Alice" or Age > 30')
print(result)
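query() can also reference Python variables with the @ prefix, which keeps the expression readable when thresholds live outside the string:
import pandas as pd
df = pd.DataFrame({'Age': [25, 30, 35], 'Salary': [50000, 60000, 70000]})
min_age = 28
print(df.query('Age > @min_age'))  # @ pulls in the local variable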
5. Adding and Modifying Columns
5.1 Adding Columns
# Add new column
df['Bonus'] = df['Salary'] * 0.1
# Conditional column
df['Senior'] = df['Age'] > 30
# Using apply()
df['Name_Length'] = df['Name'].apply(len)
# Using lambda
df['Tax'] = df['Salary'].apply(lambda x: x * 0.2)
print(df)
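For conditional values, np.where is usually faster and clearer than apply() with an if/else. A small self-contained sketch:
import pandas as pd
import numpy as np
df = pd.DataFrame({'Salary': [50000, 60000, 70000]})
# Vectorized if/else: 'High' where the condition holds, 'Standard' otherwise
df['Level'] = np.where(df['Salary'] > 55000, 'High', 'Standard')
print(df)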
5.2 Modifying Columns
# Update existing column
df['Salary'] = df['Salary'] * 1.05 # 5% raise
# Replace values (assumes the DataFrame has a 'City' column)
df['City'] = df['City'].replace('NYC', 'New York')
# Fill missing values (assignment is preferred over inplace=True on a column)
df['Age'] = df['Age'].fillna(df['Age'].mean())
5.3 Dropping Columns/Rows
# Drop column
df = df.drop('Bonus', axis=1) # axis=1 for columns
# Drop multiple columns
df = df.drop(['Col1', 'Col2'], axis=1)
# Drop rows
df = df.drop([0, 2], axis=0) # axis=0 for rows
# Drop rows with missing values
df = df.dropna()
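Closely related to dropping rows: removing duplicates, which comes up in almost every cleaning job (see also the exercises below):
import pandas as pd
df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Alice'], 'Age': [25, 30, 25]})
deduped = df.drop_duplicates()                 # drop fully duplicated rows
by_name = df.drop_duplicates(subset=['Name'])  # or compare only selected columns
print(deduped)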
6. Sorting and Ranking
import pandas as pd
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 28],
    'Salary': [50000, 60000, 70000, 55000]
})
# Sort by single column
sorted_df = df.sort_values('Age')
print(sorted_df)
# Sort descending
sorted_df = df.sort_values('Salary', ascending=False)
print(sorted_df)
# Sort by multiple columns
sorted_df = df.sort_values(['Age', 'Salary'], ascending=[True, False])
print(sorted_df)
# Ranking
df['Salary_Rank'] = df['Salary'].rank(ascending=False)
print(df)
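rank() handles ties via its method parameter; the default averages tied ranks, while 'dense' assigns consecutive ranks without gaps:
import pandas as pd
s = pd.Series([70000, 60000, 60000, 50000])
print(s.rank(ascending=False))                  # ties get the average rank (2.5, 2.5)
print(s.rank(ascending=False, method='dense'))  # ties share rank 2, next value is 3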
7. Grouping and Aggregation
7.1 GroupBy Operations
import pandas as pd
df = pd.DataFrame({
    'Department': ['Sales', 'Sales', 'IT', 'IT', 'HR'],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Salary': [50000, 60000, 70000, 65000, 55000]
})
# Group by department
grouped = df.groupby('Department')
# Aggregate functions
print(grouped['Salary'].mean())
print(grouped['Salary'].sum())
print(grouped['Salary'].count())
# Multiple aggregations
print(grouped['Salary'].agg(['mean', 'sum', 'count']))
# Custom aggregation
print(grouped.agg({
    'Salary': ['mean', 'max', 'min'],
    'Name': 'count'
}))
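Named aggregation (available since pandas 0.25) gives the output columns readable names instead of a MultiIndex, which is often what you want for reports:
import pandas as pd
df = pd.DataFrame({
    'Department': ['Sales', 'Sales', 'IT'],
    'Salary': [50000, 60000, 70000]
})
# output column name = (input column, aggregation function)
summary = df.groupby('Department').agg(
    avg_salary=('Salary', 'mean'),
    headcount=('Salary', 'count')
)
print(summary)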
7.2 Pivot Tables
# Create pivot table
pivot = df.pivot_table(
    values='Salary',
    index='Department',
    aggfunc='mean'
)
print(pivot)
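Pivot tables become more useful with a columns dimension. A sketch with a hypothetical 'Level' column added to the data:
import pandas as pd
df = pd.DataFrame({
    'Department': ['Sales', 'Sales', 'IT', 'IT'],
    'Level': ['Junior', 'Senior', 'Junior', 'Senior'],
    'Salary': [45000, 65000, 55000, 75000]
})
# One row per department, one column per level
pivot = df.pivot_table(values='Salary', index='Department',
                       columns='Level', aggfunc='mean')
print(pivot)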
8. Handling Missing Data
import pandas as pd
import numpy as np
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
})
# Check for missing values
print(df.isnull())
print(df.isnull().sum()) # Count per column
# Drop missing values
df_cleaned = df.dropna() # Drop any row with NaN
df_cleaned = df.dropna(axis=1) # Drop any column with NaN
# Fill missing values
df_filled = df.fillna(0) # Fill with 0
df_filled = df.fillna(df.mean()) # Fill with column mean
df_filled = df.ffill() # Forward fill (fillna(method='ffill') is deprecated)
df_filled = df.bfill() # Backward fill
print(df_filled)
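fillna also accepts a dict, letting each column get its own fill strategy in one call:
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': [1, 2, np.nan, 4], 'B': [5, np.nan, np.nan, 8]})
# Fill 'A' with a constant and 'B' with its own mean
df_filled = df.fillna({'A': 0, 'B': df['B'].mean()})
print(df_filled)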
9. Merging and Joining
9.1 Concatenation
import pandas as pd
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
# Vertical concatenation (stack rows)
result = pd.concat([df1, df2], ignore_index=True)
print(result)
# Horizontal concatenation (side by side)
result = pd.concat([df1, df2], axis=1)
print(result)
9.2 Merging (SQL-like joins)
df1 = pd.DataFrame({
    'user_id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie']
})
df2 = pd.DataFrame({
    'user_id': [1, 2, 4],
    'score': [85, 90, 75]
})
# Inner join (default)
merged = pd.merge(df1, df2, on='user_id')
print(merged)
# Left join
merged = pd.merge(df1, df2, on='user_id', how='left')
print(merged)
# Right join
merged = pd.merge(df1, df2, on='user_id', how='right')
print(merged)
# Outer join
merged = pd.merge(df1, df2, on='user_id', how='outer')
print(merged)
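When the join keys have different names on each side, use left_on and right_on (the column names here are illustrative):
import pandas as pd
users = pd.DataFrame({'id': [1, 2], 'name': ['Alice', 'Bob']})
scores = pd.DataFrame({'user': [1, 2], 'score': [85, 90]})
# Match users.id against scores.user
merged = pd.merge(users, scores, left_on='id', right_on='user')
print(merged)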
10. Practical Examples
10.1 Data Cleaning Pipeline
import pandas as pd
import numpy as np
# Sample messy data
df = pd.DataFrame({
    'Name': ['alice', 'BOB', 'Charlie', None, 'david'],
    'Age': [25, 30, np.nan, 35, 28],
    'Email': ['alice@example.com', 'bob@test', 'charlie@example.com', 'invalid', 'david@example.com'],
    'Salary': ['50000', '60k', '70000', '55000', '65000']
})
# Clean names
df['Name'] = df['Name'].str.title()
df['Name'] = df['Name'].fillna('Unknown')
# Clean ages
df['Age'] = df['Age'].fillna(df['Age'].median())
# Validate emails (raw string avoids an invalid-escape warning)
df['Valid_Email'] = df['Email'].str.contains(r'@.+\..+', na=False)
# Clean salaries (replace 'k' with '000', convert to numeric)
df['Salary'] = df['Salary'].str.replace('k', '000').astype(float)
print(df)
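If a column may contain strings that cannot be parsed at all, pd.to_numeric with errors='coerce' is a more defensive conversion than astype: unparseable values become NaN instead of raising an error.
import pandas as pd
s = pd.Series(['50000', '60k', 'unknown'])
# 'unknown' becomes NaN rather than crashing the pipeline
cleaned = pd.to_numeric(s.str.replace('k', '000'), errors='coerce')
print(cleaned)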
10.2 Sales Analysis
import pandas as pd
import numpy as np
# Sample sales data
df = pd.DataFrame({
    'Date': pd.date_range('2024-01-01', periods=100),
    'Product': np.random.choice(['A', 'B', 'C'], 100),
    'Sales': np.random.randint(100, 1000, 100),
    'Region': np.random.choice(['North', 'South', 'East', 'West'], 100)
})
# Total sales by product
product_sales = df.groupby('Product')['Sales'].sum().sort_values(ascending=False)
print(product_sales)
# Average sales by region
region_avg = df.groupby('Region')['Sales'].mean()
print(region_avg)
# Monthly sales
df['Month'] = df['Date'].dt.month
monthly_sales = df.groupby('Month')['Sales'].sum()
print(monthly_sales)
# Top performing product-region combinations
top_combos = df.groupby(['Product', 'Region'])['Sales'].sum().sort_values(ascending=False).head(5)
print(top_combos)
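Note that grouping by dt.month would merge January 2024 with January 2025. Continuing with the same sales DataFrame, a calendar-aware alternative groups by year-month periods instead:
# Group by year-month period so different years stay separate
monthly = df.groupby(df['Date'].dt.to_period('M'))['Sales'].sum()
print(monthly)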
11. Export Data
# Save to CSV
df.to_csv('output.csv', index=False)
# Save to Excel
df.to_excel('output.xlsx', index=False, sheet_name='Data')  # requires an Excel engine such as openpyxl
# Save to JSON
df.to_json('output.json', orient='records')
# Save to SQL
import sqlite3
conn = sqlite3.connect('database.db')
df.to_sql('table_name', conn, if_exists='replace', index=False)
12. Performance Tips
# Use categorical data for memory efficiency
df['Category'] = df['Category'].astype('category')
# Use vectorized operations instead of loops
# ❌ Slow
for idx, row in df.iterrows():
    df.at[idx, 'New_Col'] = row['A'] * 2
# ✅ Fast
df['New_Col'] = df['A'] * 2
# Chunk large files
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    process(chunk)  # process() stands in for your own handling logic
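Chunking pairs naturally with incremental aggregation. A sketch that totals an assumed 'Sales' column without ever loading the full file (file and column names are placeholders):
import pandas as pd
total = 0
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    total += chunk['Sales'].sum()  # accumulate per chunk, bounded memory
print(total)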
Summary
✅ Pandas provides DataFrame for tabular data
✅ Read/write CSV, Excel, JSON, SQL easily
✅ Powerful filtering, grouping, and aggregation
✅ Handle missing data gracefully
✅ Merge and join datasets like SQL
✅ Built on NumPy for performance
Next Steps
In Module 27, you'll learn:
- Web scraping with BeautifulSoup
- Parsing HTML content
- Extracting data from websites
- Ethical scraping practices
Practice Exercises
- Load a CSV file and perform basic exploratory data analysis
- Clean a messy dataset (handle missing values, duplicates, outliers)
- Merge two datasets and analyze combined data
- Create a sales report with grouping and pivot tables
- Build a data pipeline that reads, cleans, transforms, and exports data
Challenge
Create a comprehensive data analysis tool that:
- Loads data from multiple sources (CSV, Excel, SQL)
- Automatically detects and handles missing values
- Identifies outliers using statistical methods
- Generates summary statistics and visualizations
- Creates pivot tables and cross-tabulations
- Exports clean data and reports in multiple formats